Multiple grouping set specs referencing duplicate alias

David Kimura Fri, 21 Oct 2022 13:25:19 -0700

Hi all!

I think I may have stumbled across a case of wrong results on HEAD (same
through version 9.6, though interestingly 9.5 produces different results
altogether).


test=# SELECT i AS ai1, i AS ai2 FROM generate_series(1,3)i GROUP BY
ai2, ROLLUP(ai1) ORDER BY ai1, ai2;

 ai1 | ai2
-----+-----
   1 |   1
     |   1
   2 |   2
     |   2
   3 |   3
     |   3
(6 rows)

I had expected:

 ai1 | ai2
-----+-----
   1 |   1
   2 |   2
   3 |   3
     |   1
     |   2
     |   3
(6 rows)

It seems to me that the plan is missing a Sort node (on ai1 and ai2) above the
Aggregate node.

                   QUERY PLAN
------------------------------------------------
 GroupAggregate
   Group Key: i, i
   Group Key: i
   ->  Sort
         Sort Key: i
         ->  Function Scan on generate_series i

I have a hunch part of the issue may be an assumption that a duplicate aliased
column will produce the same column values and hence isn't included in the
range table, nor subsequently the pathkeys. However, that assumption does not
seem to be true for queries with multiple grouping set specifications:

test=#  SELECT i as ai1, i as ai2 FROM (values (1),(2),(3)) v(i) GROUP
BY ai1, ROLLUP(ai2);
 ai1 | ai2
-----+-----
   1 |   1
   2 |   2
   3 |   3
   1 |
   2 |
   3 |
(6 rows)

It seems to me that excluding the duplicate alias from the pathkeys is leading
to a case where the group order is incorrectly determined to satisfy the sort
order. Thus create_ordered_paths() decides against applying an explicit sort
node. But simply forcing an explicit sort still seems wrong since we've
effectively lost a relevant column for the sort.

I tinkered a bit and hacked together an admittedly ugly patch that triggers an
explicit sort constructed from the parse tree. An alternative approach I had
considered was to update the rewriteHandler to explicitly force the existence of
the duplicate alias column in the range tables. But that also felt meh.

Does this seem like a legit issue? And if so, any thoughts on alternative
approaches?

Thanks,
David Kimura

commit 7e7b9a0dce1ba60f4fc087082f04b04e7f405539
Author: David Kimura <dkimura@vmware.com>
Date:   Wed Oct 19 21:32:27 2022 +0000

    Fix multiple grouping specs using duplicate alias bug
    
    There seems to have been an assumption that multiple aliases that refer
    to the same column would always produce mirrored results. However, that
    does not seem to be the case when the aliased columns are passed through
    different grouping set specs. Consider the case:
    
      SELECT i AS alias1, i AS alias2 FROM generate_series(1,3)i GROUP BY alias2, ROLLUP(alias1);
    
    This led to a scenario where a relevant column is collapsed at parse
    time and thus excluded from sort pathkeys. Consequently group order
    could then incorrectly decide it satisfied the sort order. Which leads
    to create_ordered_paths() forgoing an explicit sort node.
    
    A solution to the issue is to force an explicit sort and revive the
    relevant column info.

diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index ac86ce9003..0b81d4d211 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -2157,6 +2157,46 @@ change_plan_targetlist(Plan *subplan, List *tlist, bool tlist_parallel_safe)
 	return subplan;
 }
 
+/*
+ * reevaluate_sort_info
+ *
+ * Build the sort node info from the parse tree.
+ */
+static void
+reevaluate_sort_info(Query *parse, Sort *plan)
+{
+	int			i = 0;
+	ListCell   *l1;
+	ListCell   *l2;
+
+	plan->numCols = list_length(parse->sortClause);
+
+	plan->sortColIdx = palloc(sizeof(int) * plan->numCols);
+	plan->sortOperators = palloc(sizeof(int) * plan->numCols);
+	plan->collations = palloc(sizeof(int) * plan->numCols);
+	plan->nullsFirst = palloc(sizeof(int) * plan->numCols);
+
+	foreach(l1, parse->sortClause)
+	{
+		foreach(l2, parse->targetList)
+		{
+			SortGroupClause *sortClause = (SortGroupClause *) lfirst(l1);
+			TargetEntry *targetEntry = (TargetEntry *) lfirst(l2);
+
+			if (sortClause->tleSortGroupRef == targetEntry->ressortgroupref)
+			{
+				plan->sortColIdx[i] = targetEntry->resno;
+				plan->sortOperators[i] = sortClause->sortop;
+				plan->collations[i] = exprCollation((Node *) targetEntry->expr);
+				plan->nullsFirst[i] = sortClause->nulls_first;
+
+				i++;
+				break;
+			}
+		}
+	}
+}
+
 /*
  * create_sort_plan
  *
@@ -2187,6 +2227,9 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 								   IS_OTHER_REL(best_path->subpath->parent) ?
 								   best_path->path.parent->relids : NULL);
 
+	if (best_path->need_evaluation)
+		reevaluate_sort_info(root->parse, plan);
+
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
 	return plan;
@@ -2213,6 +2256,9 @@ create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path,
 											  best_path->spath.path.parent->relids : NULL,
 											  best_path->nPresortedCols);
 
+	if (best_path->spath.need_evaluation)
+		reevaluate_sort_info(root->parse, &plan->sort);
+
 	copy_generic_path_info(&plan->sort.plan, (Path *) best_path);
 
 	return plan;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 5d0fd6e072..433fbddd06 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4852,6 +4852,60 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 	return distinct_rel;
 }
 
+/*
+ * path_is_sorted
+ *
+ * Check whether the Path satisfies sort requirement or needs evaulation.
+ *
+ * Under normal circumstances it is sufficient to simply check that the
+ * grouping path is at least as sorted as the sort path. However, this is not
+ * the case for multiple grouping sets using duplicate alias columns. The
+ * reason is because the parser removes the duplicate column from the projected
+ * range table targets, which in turn removes it from the group and sort paths.
+ * In that case, require an explict sort that needs evaulation.
+ */
+static bool
+path_is_sorted(PlannerInfo *root,
+			   Path *input_path,
+			   int *presorted_keys,
+			   bool *need_evaluation)
+{
+	*need_evaluation = false;
+
+	if (IsA(input_path, GroupingSetsPath))
+	{
+		ListCell   *lc1,
+				   *lc2,
+				   *lc3;
+		GroupingSetsPath *gspath = (GroupingSetsPath *) input_path;
+
+		if (list_length(gspath->rollups) > 1)
+			*need_evaluation = true;
+
+		foreach(lc1, gspath->rollups)
+		{
+			List	   *groupClause = ((RollupData *) lfirst(lc1))->groupClause;
+
+			if (list_length(groupClause) == list_length(root->parse->sortClause))
+			{
+				forboth(lc2, groupClause, lc3, root->parse->sortClause)
+				{
+					if (!equal(lfirst(lc2), lfirst(lc3)))
+					{
+						*need_evaluation = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	return pathkeys_count_contained_in(root->sort_pathkeys,
+									   input_path->pathkeys, presorted_keys) &&
+		!(*need_evaluation);
+
+}
+
 /*
  * create_ordered_paths
  *
@@ -4904,10 +4958,12 @@ create_ordered_paths(PlannerInfo *root,
 		Path	   *input_path = (Path *) lfirst(lc);
 		Path	   *sorted_path = input_path;
 		bool		is_sorted;
+		bool		need_evaluation;
 		int			presorted_keys;
 
-		is_sorted = pathkeys_count_contained_in(root->sort_pathkeys,
-												input_path->pathkeys, &presorted_keys);
+		is_sorted = path_is_sorted(root, input_path,
+								   &presorted_keys,
+								   &need_evaluation);
 
 		if (is_sorted)
 		{
@@ -4936,6 +4992,8 @@ create_ordered_paths(PlannerInfo *root,
 														input_path,
 														root->sort_pathkeys,
 														limit_tuples);
+				((SortPath *) sorted_path)->need_evaluation = need_evaluation;
+
 				/* Add projection step if needed */
 				if (sorted_path->pathtarget != target)
 					sorted_path = apply_projection_to_path(root, ordered_rel,
@@ -4965,6 +5023,7 @@ create_ordered_paths(PlannerInfo *root,
 																root->sort_pathkeys,
 																presorted_keys,
 																limit_tuples);
+			((IncrementalSortPath *) sorted_path)->spath.need_evaluation = need_evaluation;
 
 			/* Add projection step if needed */
 			if (sorted_path->pathtarget != target)
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 70f61ae7b1..1aed34f35c 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2932,6 +2932,7 @@ create_incremental_sort_path(PlannerInfo *root,
 	pathnode->path.pathkeys = pathkeys;
 
 	pathnode->subpath = subpath;
+	pathnode->need_evaluation = false;
 
 	cost_incremental_sort(&pathnode->path,
 						  root, pathkeys, presorted_keys,
@@ -2979,6 +2980,7 @@ create_sort_path(PlannerInfo *root,
 	pathnode->path.pathkeys = pathkeys;
 
 	pathnode->subpath = subpath;
+	pathnode->need_evaluation = false;
 
 	cost_sort(&pathnode->path, root, pathkeys,
 			  subpath->total_cost,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 6bda383bea..939711f777 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -2065,6 +2065,8 @@ typedef struct SortPath
 {
 	Path		path;
 	Path	   *subpath;		/* path representing input source */
+
+	bool		need_evaluation;	/* if sort must re-build from parse tree */
 } SortPath;
 
 /*
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index fcad5c4093..6a09232d94 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -2151,4 +2151,154 @@ select (select grouping(v1)) from (values ((select 1))) v(v1) group by v1;
         0
 (1 row)
 
+-- test duplicate alias used in multiple grouping set specs and sort ordering
+set enable_hashagg=off;
+explain (costs off)
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1;
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ Sort
+   Sort Key: "*VALUES*".column1, "*VALUES*".column1
+   ->  GroupAggregate
+         Group Key: "*VALUES*".column1, "*VALUES*".column1
+         Group Key: "*VALUES*".column1
+         ->  Sort
+               Sort Key: "*VALUES*".column1
+               ->  Values Scan on "*VALUES*"
+(8 rows)
+
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1;
+ ai1 | ai2 
+-----+-----
+   1 |   1
+   2 |   2
+   3 |   3
+   1 |    
+   2 |    
+   3 |    
+(6 rows)
+
+explain (costs off)
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ GroupAggregate
+   Group Key: "*VALUES*".column1, "*VALUES*".column1
+   Group Key: "*VALUES*".column1
+   ->  Sort
+         Sort Key: "*VALUES*".column1
+         ->  Values Scan on "*VALUES*"
+(6 rows)
+
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2;
+ ai1 | ai2 
+-----+-----
+   1 |   1
+   1 |    
+   2 |   2
+   2 |    
+   3 |   3
+   3 |    
+(6 rows)
+
+explain (costs off)
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1;
+                      QUERY PLAN                      
+------------------------------------------------------
+ Sort
+   Sort Key: i, i
+   ->  GroupAggregate
+         Group Key: i, i
+         Group Key: i
+         ->  Sort
+               Sort Key: i
+               ->  Function Scan on generate_series i
+(8 rows)
+
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1;
+ ai1 | ai2 
+-----+-----
+   1 |   1
+   2 |   2
+   3 |   3
+   1 |    
+   2 |    
+   3 |    
+(6 rows)
+
+explain (costs off)
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: i, i
+   Group Key: i
+   ->  Sort
+         Sort Key: i
+         ->  Function Scan on generate_series i
+(6 rows)
+
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2;
+ ai1 | ai2 
+-----+-----
+   1 |   1
+   1 |    
+   2 |   2
+   2 |    
+   3 |   3
+   3 |    
+(6 rows)
+
+create table gs_dup_alias as
+select i from generate_series(1, 3) i;
+explain (costs off)
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1;
+                 QUERY PLAN                 
+--------------------------------------------
+ Incremental Sort
+   Sort Key: i, i
+   Presorted Key: i
+   ->  GroupAggregate
+         Group Key: i, i
+         Group Key: i
+         ->  Sort
+               Sort Key: i
+               ->  Seq Scan on gs_dup_alias
+(9 rows)
+
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1;
+ ai1 | ai2 
+-----+-----
+   1 |   1
+   2 |   2
+   3 |   3
+   1 |    
+   2 |    
+   3 |    
+(6 rows)
+
+explain (costs off)
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2;
+              QUERY PLAN              
+--------------------------------------
+ GroupAggregate
+   Group Key: i, i
+   Group Key: i
+   ->  Sort
+         Sort Key: i
+         ->  Seq Scan on gs_dup_alias
+(6 rows)
+
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2;
+ ai1 | ai2 
+-----+-----
+   1 |   1
+   1 |    
+   2 |   2
+   2 |    
+   3 |   3
+   3 |    
+(6 rows)
+
+reset enable_hashagg;
 -- end
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index 90ba27257a..36c9d7a4a8 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -589,4 +589,31 @@ explain (costs off)
 select (select grouping(v1)) from (values ((select 1))) v(v1) group by v1;
 select (select grouping(v1)) from (values ((select 1))) v(v1) group by v1;
 
+-- test duplicate alias used in multiple grouping set specs and sort ordering
+set enable_hashagg=off;
+explain (costs off)
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1;
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1;
+explain (costs off)
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2;
+select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2;
+
+explain (costs off)
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1;
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1;
+explain (costs off)
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2;
+select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2;
+
+create table gs_dup_alias as
+select i from generate_series(1, 3) i;
+explain (costs off)
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1;
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1;
+explain (costs off)
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2;
+select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2;
+
+reset enable_hashagg;
+
 -- end

Multiple grouping set specs referencing duplicate alias

Reply via email to