From 3da12defe67efcb10ca719bd0bd64e2acf6b37fa Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Wed, 21 Feb 2024 11:43:00 +0200
Subject: [PATCH] Multiple revises for the GROUP BY reordering tests

Reported-by: Richard Guo
Discussion: https://postgr.es/m/CAMbWs4_NF0stupM8guasC_yCJxdTvRqm3a%2BJLarjvAxcH314TQ%40mail.gmail.com
---
 src/test/regress/expected/aggregates.out | 185 ++++++++++-------------
 src/test/regress/sql/aggregates.sql      |  90 ++++++-----
 2 files changed, 126 insertions(+), 149 deletions(-)

diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index dba6f230014..12460bcb7f5 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2728,29 +2728,20 @@ SELECT balk(hundred) FROM tenk1;
 (1 row)
 
 ROLLBACK;
--- GROUP BY optimization by reorder columns
+-- GROUP BY optimization by reordering GROUP BY clauses
 CREATE TABLE btg AS SELECT
-  i % 100 AS x,
-  i % 100 AS y,
+  i % 10 AS x,
+  i % 10 AS y,
   'abc' || i % 10 AS z,
   i AS w
-FROM generate_series(1,10000) AS i;
-CREATE INDEX btg_x_y_idx ON btg(x,y);
+FROM generate_series(1, 100) AS i;
+CREATE INDEX btg_x_y_idx ON btg(x, y);
 ANALYZE btg;
--- GROUP BY optimization by reorder columns by frequency
-SET enable_hashagg=off;
-SET max_parallel_workers= 0;
-SET max_parallel_workers_per_gather = 0;
--- Utilize index scan ordering to avoid a Sort operation
-EXPLAIN (COSTS OFF) SELECT count(*) FROM btg GROUP BY x,y;
-                   QUERY PLAN                   
-------------------------------------------------
- GroupAggregate
-   Group Key: x, y
-   ->  Index Only Scan using btg_x_y_idx on btg
-(3 rows)
-
-EXPLAIN (COSTS OFF) SELECT count(*) FROM btg GROUP BY y,x;
+SET enable_hashagg = off;
+SET enable_seqscan = off;
+-- Utilize the ordering of index scan to avoid a Sort operation
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY y, x;
                    QUERY PLAN                   
 ------------------------------------------------
  GroupAggregate
@@ -2759,21 +2750,11 @@ EXPLAIN (COSTS OFF) SELECT count(*) FROM btg GROUP BY y,x;
 (3 rows)
 
 -- Engage incremental sort
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY x,y,z,w;
-                   QUERY PLAN                    
--------------------------------------------------
- Group
-   Group Key: x, y, z, w
-   ->  Incremental Sort
-         Sort Key: x, y, z, w
-         Presorted Key: x, y
-         ->  Index Scan using btg_x_y_idx on btg
-(6 rows)
-
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY z,y,w,x;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY z, y, w, x;
                    QUERY PLAN                    
 -------------------------------------------------
- Group
+ GroupAggregate
    Group Key: x, y, z, w
    ->  Incremental Sort
          Sort Key: x, y, z, w
@@ -2781,35 +2762,13 @@ explain (COSTS OFF) SELECT x,y FROM btg GROUP BY z,y,w,x;
          ->  Index Scan using btg_x_y_idx on btg
 (6 rows)
 
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY w,z,x,y;
-                   QUERY PLAN                    
--------------------------------------------------
- Group
-   Group Key: x, y, w, z
-   ->  Incremental Sort
-         Sort Key: x, y, w, z
-         Presorted Key: x, y
-         ->  Index Scan using btg_x_y_idx on btg
-(6 rows)
-
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY w,x,z,y;
-                   QUERY PLAN                    
--------------------------------------------------
- Group
-   Group Key: x, y, w, z
-   ->  Incremental Sort
-         Sort Key: x, y, w, z
-         Presorted Key: x, y
-         ->  Index Scan using btg_x_y_idx on btg
-(6 rows)
-
--- Subqueries
-explain (COSTS OFF) SELECT x,y
-FROM (SELECT * FROM btg ORDER BY x,y,w,z) AS q1
-GROUP BY (w,x,z,y);
+-- Utilize the ordering of subquery scan to avoid a Sort operation
+EXPLAIN (COSTS OFF) SELECT count(*)
+FROM (SELECT * FROM btg ORDER BY x, y, w, z) AS q1
+GROUP BY w, x, z, y;
                    QUERY PLAN                    
 -------------------------------------------------
- Group
+ GroupAggregate
    Group Key: btg.x, btg.y, btg.w, btg.z
    ->  Incremental Sort
          Sort Key: btg.x, btg.y, btg.w, btg.z
@@ -2817,38 +2776,52 @@ GROUP BY (w,x,z,y);
          ->  Index Scan using btg_x_y_idx on btg
 (6 rows)
 
-explain (COSTS OFF) SELECT x,y
-FROM (SELECT * FROM btg ORDER BY x,y,w,z LIMIT 100) AS q1
-GROUP BY (w,x,z,y);
-                      QUERY PLAN                       
--------------------------------------------------------
- Group
-   Group Key: btg.x, btg.y, btg.w, btg.z
-   ->  Limit
-         ->  Incremental Sort
-               Sort Key: btg.x, btg.y, btg.w, btg.z
-               Presorted Key: btg.x, btg.y
-               ->  Index Scan using btg_x_y_idx on btg
-(7 rows)
+-- Utilize the ordering of merge join to avoid a full Sort operation
+SET enable_hashjoin = off;
+SET enable_nestloop = off;
+EXPLAIN (COSTS OFF)
+SELECT count(*)
+  FROM btg t1 JOIN btg t2 ON t1.z = t2.z AND t1.w = t2.w AND t1.x = t2.x
+  GROUP BY t1.x, t1.y, t1.z, t1.w;
+                                  QUERY PLAN                                   
+-------------------------------------------------------------------------------
+ GroupAggregate
+   Group Key: t1.z, t1.w, t1.x, t1.y
+   ->  Incremental Sort
+         Sort Key: t1.z, t1.w, t1.x, t1.y
+         Presorted Key: t1.z, t1.w, t1.x
+         ->  Merge Join
+               Merge Cond: ((t1.z = t2.z) AND (t1.w = t2.w) AND (t1.x = t2.x))
+               ->  Sort
+                     Sort Key: t1.z, t1.w, t1.x
+                     ->  Index Scan using btg_x_y_idx on btg t1
+               ->  Sort
+                     Sort Key: t2.z, t2.w, t2.x
+                     ->  Index Scan using btg_x_y_idx on btg t2
+(13 rows)
 
+RESET enable_nestloop;
+RESET enable_hashjoin;
 -- Should work with and without GROUP-BY optimization
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY w,x,z,y ORDER BY y,x,z,w;
-          QUERY PLAN          
-------------------------------
- Group
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY w, x, z, y ORDER BY y, x, z, w;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
    Group Key: y, x, z, w
    ->  Sort
          Sort Key: y, x, z, w
-         ->  Seq Scan on btg
+         ->  Index Scan using btg_x_y_idx on btg
 (5 rows)
 
 -- Utilize incremental sort to make the ORDER BY rule a bit cheaper
-explain (COSTS OFF) SELECT x,w FROM btg GROUP BY w,x,y,z ORDER BY x*x,z;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY w, x, y, z ORDER BY x*x, z;
                       QUERY PLAN                       
 -------------------------------------------------------
  Sort
    Sort Key: ((x * x)), z
-   ->  Group
+   ->  GroupAggregate
          Group Key: x, y, w, z
          ->  Incremental Sort
                Sort Key: x, y, w, z
@@ -2856,23 +2829,22 @@ explain (COSTS OFF) SELECT x,w FROM btg GROUP BY w,x,y,z ORDER BY x*x,z;
                ->  Index Scan using btg_x_y_idx on btg
 (8 rows)
 
-SET enable_incremental_sort = off;
--- The case when the number of incoming subtree path keys is more than
+-- Test the case where the number of incoming subtree path keys is more than
 -- the number of grouping keys.
-CREATE INDEX idx_y_x_z ON btg(y,x,w);
+CREATE INDEX btg_y_x_w_idx ON btg(y, x, w);
 EXPLAIN (VERBOSE, COSTS OFF)
-SELECT y,x,array_agg(distinct w) FROM btg WHERE y < 0 GROUP BY x,y;
-                     QUERY PLAN                      
------------------------------------------------------
+SELECT y, x, array_agg(distinct w)
+  FROM btg WHERE y < 0 GROUP BY x, y;
+                       QUERY PLAN                        
+---------------------------------------------------------
  GroupAggregate
    Output: y, x, array_agg(DISTINCT w)
    Group Key: btg.y, btg.x
-   ->  Index Only Scan using idx_y_x_z on public.btg
+   ->  Index Only Scan using btg_y_x_w_idx on public.btg
          Output: y, x, w
          Index Cond: (btg.y < 0)
 (6 rows)
 
-RESET enable_incremental_sort;
 -- Check we don't pick aggregate path key instead of grouping path key
 CREATE TABLE group_agg_pk AS SELECT
   i % 10 AS x,
@@ -2897,37 +2869,37 @@ GROUP BY c1.w, c1.z;
 RESET enable_nestloop;
 RESET enable_hashjoin;
 DROP TABLE group_agg_pk;
--- The case, when scanning sort order correspond to aggregate sort order but
--- can not be found in the group-by list
+-- Test the case where the the ordering of scan matches the ordering within the
+-- aggregate but cannot be found in the group-by list
 CREATE TABLE agg_sort_order (c1 int PRIMARY KEY, c2 int);
-CREATE UNIQUE INDEX ON agg_sort_order(c2);
-explain (costs off)
+CREATE UNIQUE INDEX agg_sort_order_c2_idx ON agg_sort_order(c2);
+INSERT INTO agg_sort_order SELECT i, i FROM generate_series(1,100)i;
+ANALYZE agg_sort_order;
+EXPLAIN (COSTS OFF)
 SELECT array_agg(c1 ORDER BY c2),c2
 FROM agg_sort_order WHERE c2 < 100 GROUP BY c1 ORDER BY 2;
-                             QUERY PLAN                             
---------------------------------------------------------------------
+                                 QUERY PLAN                                 
+----------------------------------------------------------------------------
  Sort
    Sort Key: c2
    ->  GroupAggregate
          Group Key: c1
          ->  Sort
                Sort Key: c1, c2
-               ->  Bitmap Heap Scan on agg_sort_order
-                     Recheck Cond: (c2 < 100)
-                     ->  Bitmap Index Scan on agg_sort_order_c2_idx
-                           Index Cond: (c2 < 100)
-(10 rows)
+               ->  Index Scan using agg_sort_order_c2_idx on agg_sort_order
+                     Index Cond: (c2 < 100)
+(8 rows)
 
 DROP TABLE agg_sort_order CASCADE;
 -- Check, that GROUP-BY reordering optimization can operate with pathkeys, built
--- by planner itself. For example, by MergeJoin.
+-- by planner itself.  For example, by MergeJoin.
 SET enable_hashjoin = off;
 SET enable_nestloop = off;
-explain (COSTS OFF)
-SELECT b1.x,b1.w FROM btg b1 JOIN btg b2 ON (b1.z=b2.z AND b1.w=b2.w)
-GROUP BY b1.x,b1.z,b1.w ORDER BY b1.z, b1.w, b1.x*b1.x;
-                            QUERY PLAN                             
--------------------------------------------------------------------
+EXPLAIN (COSTS OFF)
+SELECT b1.x, b1.w FROM btg b1 JOIN btg b2 ON (b1.z = b2.z AND b1.w = b2.w)
+GROUP BY b1.x, b1.z, b1.w ORDER BY b1.z, b1.w, b1.x*b1.x;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
  Incremental Sort
    Sort Key: b1.z, b1.w, ((b1.x * b1.x))
    Presorted Key: b1.z, b1.w
@@ -2940,7 +2912,7 @@ GROUP BY b1.x,b1.z,b1.w ORDER BY b1.z, b1.w, b1.x*b1.x;
                      Merge Cond: ((b1.z = b2.z) AND (b1.w = b2.w))
                      ->  Sort
                            Sort Key: b1.z, b1.w
-                           ->  Seq Scan on btg b1
+                           ->  Index Scan using btg_x_y_idx on btg b1
                      ->  Sort
                            Sort Key: b2.z, b2.w
                            ->  Seq Scan on btg b2
@@ -2950,8 +2922,7 @@ RESET enable_hashjoin;
 RESET enable_nestloop;
 DROP TABLE btg;
 RESET enable_hashagg;
-RESET max_parallel_workers;
-RESET max_parallel_workers_per_gather;
+RESET enable_seqscan;
 -- Secondly test the case of a parallel aggregate combiner function
 -- returning NULL. For that use normal transition function, but a
 -- combiner function returning NULL.
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index d6ed5d0effa..3c75c2aaa65 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1181,53 +1181,56 @@ SELECT balk(hundred) FROM tenk1;
 
 ROLLBACK;
 
--- GROUP BY optimization by reorder columns
+-- GROUP BY optimization by reordering GROUP BY clauses
 CREATE TABLE btg AS SELECT
-  i % 100 AS x,
-  i % 100 AS y,
+  i % 10 AS x,
+  i % 10 AS y,
   'abc' || i % 10 AS z,
   i AS w
-FROM generate_series(1,10000) AS i;
-CREATE INDEX btg_x_y_idx ON btg(x,y);
+FROM generate_series(1, 100) AS i;
+CREATE INDEX btg_x_y_idx ON btg(x, y);
 ANALYZE btg;
 
--- GROUP BY optimization by reorder columns by frequency
-
-SET enable_hashagg=off;
-SET max_parallel_workers= 0;
-SET max_parallel_workers_per_gather = 0;
+SET enable_hashagg = off;
+SET enable_seqscan = off;
 
--- Utilize index scan ordering to avoid a Sort operation
-EXPLAIN (COSTS OFF) SELECT count(*) FROM btg GROUP BY x,y;
-EXPLAIN (COSTS OFF) SELECT count(*) FROM btg GROUP BY y,x;
+-- Utilize the ordering of index scan to avoid a Sort operation
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY y, x;
 
 -- Engage incremental sort
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY x,y,z,w;
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY z,y,w,x;
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY w,z,x,y;
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY w,x,z,y;
-
--- Subqueries
-explain (COSTS OFF) SELECT x,y
-FROM (SELECT * FROM btg ORDER BY x,y,w,z) AS q1
-GROUP BY (w,x,z,y);
-explain (COSTS OFF) SELECT x,y
-FROM (SELECT * FROM btg ORDER BY x,y,w,z LIMIT 100) AS q1
-GROUP BY (w,x,z,y);
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY z, y, w, x;
+
+-- Utilize the ordering of subquery scan to avoid a Sort operation
+EXPLAIN (COSTS OFF) SELECT count(*)
+FROM (SELECT * FROM btg ORDER BY x, y, w, z) AS q1
+GROUP BY w, x, z, y;
+
+-- Utilize the ordering of merge join to avoid a full Sort operation
+SET enable_hashjoin = off;
+SET enable_nestloop = off;
+EXPLAIN (COSTS OFF)
+SELECT count(*)
+  FROM btg t1 JOIN btg t2 ON t1.z = t2.z AND t1.w = t2.w AND t1.x = t2.x
+  GROUP BY t1.x, t1.y, t1.z, t1.w;
+RESET enable_nestloop;
+RESET enable_hashjoin;
 
 -- Should work with and without GROUP-BY optimization
-explain (COSTS OFF) SELECT x,y FROM btg GROUP BY w,x,z,y ORDER BY y,x,z,w;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY w, x, z, y ORDER BY y, x, z, w;
 
 -- Utilize incremental sort to make the ORDER BY rule a bit cheaper
-explain (COSTS OFF) SELECT x,w FROM btg GROUP BY w,x,y,z ORDER BY x*x,z;
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM btg GROUP BY w, x, y, z ORDER BY x*x, z;
 
-SET enable_incremental_sort = off;
--- The case when the number of incoming subtree path keys is more than
+-- Test the case where the number of incoming subtree path keys is more than
 -- the number of grouping keys.
-CREATE INDEX idx_y_x_z ON btg(y,x,w);
+CREATE INDEX btg_y_x_w_idx ON btg(y, x, w);
 EXPLAIN (VERBOSE, COSTS OFF)
-SELECT y,x,array_agg(distinct w) FROM btg WHERE y < 0 GROUP BY x,y;
-RESET enable_incremental_sort;
+SELECT y, x, array_agg(distinct w)
+  FROM btg WHERE y < 0 GROUP BY x, y;
 
 -- Check we don't pick aggregate path key instead of grouping path key
 CREATE TABLE group_agg_pk AS SELECT
@@ -1248,29 +1251,32 @@ RESET enable_nestloop;
 RESET enable_hashjoin;
 DROP TABLE group_agg_pk;
 
--- The case, when scanning sort order correspond to aggregate sort order but
--- can not be found in the group-by list
+-- Test the case where the the ordering of scan matches the ordering within the
+-- aggregate but cannot be found in the group-by list
 CREATE TABLE agg_sort_order (c1 int PRIMARY KEY, c2 int);
-CREATE UNIQUE INDEX ON agg_sort_order(c2);
-explain (costs off)
+CREATE UNIQUE INDEX agg_sort_order_c2_idx ON agg_sort_order(c2);
+INSERT INTO agg_sort_order SELECT i, i FROM generate_series(1,100)i;
+ANALYZE agg_sort_order;
+
+EXPLAIN (COSTS OFF)
 SELECT array_agg(c1 ORDER BY c2),c2
 FROM agg_sort_order WHERE c2 < 100 GROUP BY c1 ORDER BY 2;
+
 DROP TABLE agg_sort_order CASCADE;
 
 -- Check, that GROUP-BY reordering optimization can operate with pathkeys, built
--- by planner itself. For example, by MergeJoin.
+-- by planner itself.  For example, by MergeJoin.
 SET enable_hashjoin = off;
 SET enable_nestloop = off;
-explain (COSTS OFF)
-SELECT b1.x,b1.w FROM btg b1 JOIN btg b2 ON (b1.z=b2.z AND b1.w=b2.w)
-GROUP BY b1.x,b1.z,b1.w ORDER BY b1.z, b1.w, b1.x*b1.x;
+EXPLAIN (COSTS OFF)
+SELECT b1.x, b1.w FROM btg b1 JOIN btg b2 ON (b1.z = b2.z AND b1.w = b2.w)
+GROUP BY b1.x, b1.z, b1.w ORDER BY b1.z, b1.w, b1.x*b1.x;
 RESET enable_hashjoin;
 RESET enable_nestloop;
 
 DROP TABLE btg;
 RESET enable_hashagg;
-RESET max_parallel_workers;
-RESET max_parallel_workers_per_gather;
+RESET enable_seqscan;
 
 -- Secondly test the case of a parallel aggregate combiner function
 -- returning NULL. For that use normal transition function, but a
-- 
2.39.3 (Apple Git-145)