Re: POC: GROUP BY optimization

Teodor Sigaev Thu, 07 Jun 2018 09:23:02 -0700

Yes. But again, this description is a bit short. First it works after
first patch and might get some preordered leading pathkeys. Second, it
tries to match ORDER BY clause order if there is no preordered leading
pathkeys from first patch (it was introduced in v7). And third, if there
is a tail of unmatched pathkeys on previous stages then it will reorder
that tail.


OK, I haven't looked at v7 yet, but if I understand correctly it tries
to maintain the ordering as much as possible? Does that actually help? I
mean, the incremental sort patch allows the sorting to happen by pieces,
but here we still need to sort all the data, right?

Can you give an example demonstrating the benefit?


See tst.sql. queries are marked with opt (optimization is on) and noopt.

Query 1: select count(*) from btg group by v, r;
Query 2: select count(*) from btg group by n, v, r order by n;

For both queries it's possible to reorder v and r column, n column has the single distinct value.


On my laptop:
Query 1opt vs 1noopt: 3177.500 ms vs 6604.493 ms
      2opt vs 2noopt: 5800.307 ms vs 7486.967 ms

So, what we see:

1) for query 1 optimization gives 2 times better performance, for query 2 only 30%. if column 'n' will be unique then time for query 1 and 2 should be the same. We could add check for preordered pathkeys in get_cheapest_group_keys_order() and if estimate_num_groups(reordered pathkeys) is close to 1 then we could do not reordering of tail of pathkeys.

2) Planing cost is the same for all queries. So, cost_sort() doesn't take into account even number of columns.

FWIW I think it would be useful to have "development GUC" that would
allow us to enable/disable these options during development, because
that makes experiments much easier. But then remove them before commit.

Added, v9, debug_enable_group_by_match_order_by and debug_enable_cheapest_group_by. I also checked compatibility with incremental sort patch, and all works except small merge conflict which could be resolved right before committing.

Next, I had a look on cost_incremental_sort() provided by incremental sort patch and, it's a pity, it doesn't solve our problem with the impact of the cost of per-column comparison function and number of its calls.



--
Teodor Sigaev                                   E-mail: teo...@sigaev.ru
                                                   WWW: http://www.sigaev.ru/

diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index b22b36ec0e..c073eb061a 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -686,7 +686,18 @@ get_eclass_for_sort_expr(PlannerInfo *root,
 
 			if (opcintype == cur_em->em_datatype &&
 				equal(expr, cur_em->em_expr))
-				return cur_ec;	/* Match! */
+			{
+				/*
+				 * Match!
+				 *
+				 * Copy sortref if it wasn't set yet, it's possible if ec was
+				 * constructed from WHERE clause, ie it doesn't have target
+				 * reference at all
+				 */
+				if (cur_ec->ec_sortref == 0 && sortref > 0)
+					cur_ec->ec_sortref = sortref;
+				return cur_ec;
+			}
 		}
 	}
 
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index ec66cb9c3c..4f93afdebc 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -26,6 +26,7 @@
 #include "optimizer/paths.h"
 #include "optimizer/tlist.h"
 #include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
 
 
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
@@ -327,6 +328,58 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/*
+ * Reorder GROUP BY pathkeys and clauses to match order of pathkeys. Function
+ * returns new lists,  original GROUP BY lists stay untouched.
+ */
+int
+group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
+							   List **group_clauses)
+{
+	List		*new_group_pathkeys= NIL,
+				*new_group_clauses = NIL;
+	ListCell	*key;
+	int			n;
+
+	if (pathkeys == NIL || *group_pathkeys == NIL)
+		return 0;
+
+	/*
+	 * For each pathkey it tries to find corresponding GROUP BY pathkey and
+	 * clause.
+	 */
+	foreach(key, pathkeys)
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(key);
+		SortGroupClause	*sgc;
+
+		/*
+		 * Pathkey should use the same allocated struct, so, equiality of
+		 * pointers is enough
+		 */
+		if (!list_member_ptr(*group_pathkeys, pathkey))
+			break;
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+
+		sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+									  *group_clauses);
+		new_group_clauses = lappend(new_group_clauses, sgc);
+	}
+
+	n = list_length(new_group_pathkeys);
+
+	/*
+	 * Just append the rest of pathkeys and clauses
+	 */
+	*group_pathkeys = list_concat_unique_ptr(new_group_pathkeys,
+												 *group_pathkeys);
+	*group_clauses = list_concat_unique_ptr(new_group_clauses,
+											*group_clauses);
+
+	return n;
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
@@ -1611,6 +1664,39 @@ pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 	return 0;					/* path ordering not useful */
 }
 
+/*
+ * pathkeys_useful_for_grouping
+ *		Count the number of pathkeys that are useful for grouping (instead of
+ *		explicit sort)
+ *
+ * Group pathkeys could be reordered, so we don't bother about actual order in
+ * pathkeys
+ */
+static int
+pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys)
+{
+	ListCell *key;
+	int		  n = 0;
+
+	if (root->group_pathkeys == NIL)
+		return 0;				/* no special ordering requested */
+
+	if (pathkeys == NIL)
+		return 0;				/* unordered path */
+
+	foreach(key, pathkeys)
+	{
+		PathKey	*pathkey = (PathKey *) lfirst(key);
+
+		if (!list_member_ptr(root->group_pathkeys, pathkey))
+			break;
+
+		n++;
+	}
+
+	return n;
+}
+
 /*
  * truncate_useless_pathkeys
  *		Shorten the given pathkey list to just the useful pathkeys.
@@ -1625,6 +1711,9 @@ truncate_useless_pathkeys(PlannerInfo *root,
 
 	nuseful = pathkeys_useful_for_merging(root, rel, pathkeys);
 	nuseful2 = pathkeys_useful_for_ordering(root, pathkeys);
+	if (nuseful2 > nuseful)
+		nuseful = nuseful2;
+	nuseful2 = pathkeys_useful_for_grouping(root, pathkeys);
 	if (nuseful2 > nuseful)
 		nuseful = nuseful2;
 
@@ -1660,6 +1749,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel)
 {
 	if (rel->joininfo != NIL || rel->has_eclass_joins)
 		return true;			/* might be able to use pathkeys for merging */
+	if (root->group_pathkeys != NIL)
+		return true;			/* might be able to use pathkeys for grouping */ 
 	if (root->query_pathkeys != NIL)
 		return true;			/* might be able to use them for ordering */
 	return false;				/* definitely useless */
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 67a2c7a581..1e7809edf2 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6182,9 +6182,28 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 		{
 			Path	   *path = (Path *) lfirst(lc);
 			bool		is_sorted;
+			List	   *group_pathkeys = root->group_pathkeys,
+					   *group_clauses = parse->groupClause;
+			int			n_preordered_groups;
+
+			if (parse->groupingSets)
+			{
+				/*
+				 * prevent group pathkey rreordering, just check the same
+				 * order paths pathkeys and group pathkeys
+				 */
+				is_sorted = pathkeys_contained_in(group_pathkeys,
+												  path->pathkeys);
+			}
+			else
+			{
+				n_preordered_groups =
+						group_keys_reorder_by_pathkeys(path->pathkeys,
+													   &group_pathkeys,
+													   &group_clauses);
+				is_sorted = (n_preordered_groups == list_length(group_pathkeys));
+			}
 
-			is_sorted = pathkeys_contained_in(root->group_pathkeys,
-											  path->pathkeys);
 			if (path == cheapest_path || is_sorted)
 			{
 				/* Sort the cheapest-total path if it isn't already sorted */
@@ -6192,7 +6211,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
-													 root->group_pathkeys,
+													 group_pathkeys,
 													 -1.0);
 
 				/* Now decide what to stick atop it */
@@ -6213,9 +6232,9 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 											 grouped_rel,
 											 path,
 											 grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+											 group_clauses ? AGG_SORTED : AGG_PLAIN,
 											 AGGSPLIT_SIMPLE,
-											 parse->groupClause,
+											 group_clauses,
 											 havingQual,
 											 agg_costs,
 											 dNumGroups));
@@ -6230,7 +6249,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 							 create_group_path(root,
 											   grouped_rel,
 											   path,
-											   parse->groupClause,
+											   group_clauses,
 											   havingQual,
 											   dNumGroups));
 				}
@@ -6251,19 +6270,26 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			foreach(lc, partially_grouped_rel->pathlist)
 			{
 				Path	   *path = (Path *) lfirst(lc);
+				List	   *group_pathkeys = root->group_pathkeys,
+						   *group_clauses = parse->groupClause;
+				int			n_preordered_groups;
+
+				n_preordered_groups = group_keys_reorder_by_pathkeys(path->pathkeys,
+																	 &group_pathkeys,
+																	 &group_clauses);
 
 				/*
 				 * Insert a Sort node, if required.  But there's no point in
 				 * sorting anything but the cheapest path.
 				 */
-				if (!pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
+				if (n_preordered_groups != list_length(group_pathkeys))
 				{
 					if (path != partially_grouped_rel->cheapest_total_path)
 						continue;
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
-													 root->group_pathkeys,
+													 group_pathkeys,
 													 -1.0);
 				}
 
@@ -6273,9 +6299,9 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 											 grouped_rel,
 											 path,
 											 grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+											 group_clauses ? AGG_SORTED : AGG_PLAIN,
 											 AGGSPLIT_FINAL_DESERIAL,
-											 parse->groupClause,
+											 group_clauses,
 											 havingQual,
 											 agg_final_costs,
 											 dNumGroups));
@@ -6284,7 +6310,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 							 create_group_path(root,
 											   grouped_rel,
 											   path,
-											   parse->groupClause,
+											   group_clauses,
 											   havingQual,
 											   dNumGroups));
 			}
@@ -6521,9 +6547,15 @@ create_partial_grouping_paths(PlannerInfo *root,
 		{
 			Path	   *path = (Path *) lfirst(lc);
 			bool		is_sorted;
+			List	   *group_pathkeys = root->group_pathkeys,
+					   *group_clauses = parse->groupClause;
+			int			n_preordered_groups;
+
+			n_preordered_groups = group_keys_reorder_by_pathkeys(path->pathkeys,
+																 &group_pathkeys,
+																 &group_clauses);
+			is_sorted = (n_preordered_groups == list_length(group_pathkeys));
 
-			is_sorted = pathkeys_contained_in(root->group_pathkeys,
-											  path->pathkeys);
 			if (path == cheapest_total_path || is_sorted)
 			{
 				/* Sort the cheapest partial path, if it isn't already */
@@ -6531,7 +6563,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
-													 root->group_pathkeys,
+													 group_pathkeys,
 													 -1.0);
 
 				if (parse->hasAggs)
@@ -6540,9 +6572,9 @@ create_partial_grouping_paths(PlannerInfo *root,
 											 partially_grouped_rel,
 											 path,
 											 partially_grouped_rel->reltarget,
-											 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+											 group_clauses ? AGG_SORTED : AGG_PLAIN,
 											 AGGSPLIT_INITIAL_SERIAL,
-											 parse->groupClause,
+											 group_clauses,
 											 NIL,
 											 agg_partial_costs,
 											 dNumPartialGroups));
@@ -6551,7 +6583,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 							 create_group_path(root,
 											   partially_grouped_rel,
 											   path,
-											   parse->groupClause,
+											   group_clauses,
 											   NIL,
 											   dNumPartialGroups));
 			}
@@ -6565,17 +6597,24 @@ create_partial_grouping_paths(PlannerInfo *root,
 		{
 			Path	   *path = (Path *) lfirst(lc);
 			bool		is_sorted;
+			List	   *group_pathkeys = root->group_pathkeys,
+					   *group_clauses = parse->groupClause;
+			int			n_preordered_groups;
+
+			n_preordered_groups = group_keys_reorder_by_pathkeys(path->pathkeys,
+																 &group_pathkeys,
+																 &group_clauses);
+			is_sorted = (n_preordered_groups == list_length(group_pathkeys));
 
-			is_sorted = pathkeys_contained_in(root->group_pathkeys,
-											  path->pathkeys);
 			if (path == cheapest_partial_path || is_sorted)
 			{
+
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
-													 root->group_pathkeys,
+													 group_pathkeys,
 													 -1.0);
 
 				if (parse->hasAggs)
@@ -6584,9 +6623,9 @@ create_partial_grouping_paths(PlannerInfo *root,
 													 partially_grouped_rel,
 													 path,
 													 partially_grouped_rel->reltarget,
-													 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+													 group_clauses ? AGG_SORTED : AGG_PLAIN,
 													 AGGSPLIT_INITIAL_SERIAL,
-													 parse->groupClause,
+													 group_clauses,
 													 NIL,
 													 agg_partial_costs,
 													 dNumPartialPartialGroups));
@@ -6595,7 +6634,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 									 create_group_path(root,
 													   partially_grouped_rel,
 													   path,
-													   parse->groupClause,
+													   group_clauses,
 													   NIL,
 													   dNumPartialPartialGroups));
 			}
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index cafde307ad..226b293622 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -190,6 +190,9 @@ typedef enum
 
 extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
 extern bool pathkeys_contained_in(List *keys1, List *keys2);
+extern int group_keys_reorder_by_pathkeys(List *pathkeys,
+										  List **group_pathkeys,
+										  List **group_clauses);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 							   Relids required_outer,
 							   CostSelector cost_criterion,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index f85e913850..e302dfbdce 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2065,3 +2065,107 @@ SELECT balk(hundred) FROM tenk1;
 (1 row)
 
 ROLLBACK;
+-- GROUP BY optimization by reorder columns
+SELECT
+	i AS id,
+	i/2 AS p,
+	format('%60s', i%2) AS v,
+	i/4 AS c,
+	i/8 AS d
+	INTO btg
+FROM
+	generate_series(1, 10000) i;
+CREATE INDEX ON btg(p, v);
+VACUUM btg;
+ANALYZE btg;
+SET enable_hashagg=off;
+SET max_parallel_workers= 0;
+SET max_parallel_workers_per_gather = 0;
+SET enable_seqscan=off;
+SET enable_bitmapscan=off;
+-- GROUP BY optimization by reorder columns by index scan
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v ORDER BY p, v;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p ORDER BY p, v;
+                   QUERY PLAN                   
+------------------------------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Index Only Scan using btg_p_v_idx on btg
+(3 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, v, c
+   ->  Sort
+         Sort Key: p, v, c
+         ->  Index Scan using btg_p_v_idx on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY p, v;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, v, c
+   ->  Sort
+         Sort Key: p, v, c
+         ->  Index Scan using btg_p_v_idx on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, v, c, d
+   ->  Sort
+         Sort Key: p, v, c, d
+         ->  Index Scan using btg_p_v_idx on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d ORDER BY p, v;
+                   QUERY PLAN                    
+-------------------------------------------------
+ GroupAggregate
+   Group Key: p, v, c, d
+   ->  Sort
+         Sort Key: p, v, c, d
+         ->  Index Scan using btg_p_v_idx on btg
+(5 rows)
+
+RESET enable_hashagg;
+RESET max_parallel_workers;
+RESET max_parallel_workers_per_gather;
+RESET enable_seqscan;
+RESET enable_bitmapscan;
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 506d0442d7..7ef703f3a7 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -907,3 +907,57 @@ EXPLAIN (COSTS OFF) SELECT balk(hundred) FROM tenk1;
 SELECT balk(hundred) FROM tenk1;
 
 ROLLBACK;
+
+-- GROUP BY optimization by reorder columns
+
+SELECT
+	i AS id,
+	i/2 AS p,
+	format('%60s', i%2) AS v,
+	i/4 AS c,
+	i/8 AS d
+	INTO btg
+FROM
+	generate_series(1, 10000) i;
+CREATE INDEX ON btg(p, v);
+
+VACUUM btg;
+ANALYZE btg;
+
+SET enable_hashagg=off;
+SET max_parallel_workers= 0;
+SET max_parallel_workers_per_gather = 0;
+SET enable_seqscan=off;
+SET enable_bitmapscan=off;
+
+-- GROUP BY optimization by reorder columns by index scan
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v ORDER BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p ORDER BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, c, p, d ORDER BY p, v;
+
+RESET enable_hashagg;
+RESET max_parallel_workers;
+RESET max_parallel_workers_per_gather;
+RESET enable_seqscan;
+RESET enable_bitmapscan;
+

diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 4f93afdebc..586f8e6e79 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -380,6 +380,157 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
 	return n;
 }
 
+/************************<DEBUG PART>*************************************/
+bool debug_group_by_match_order_by = true;
+bool debug_cheapest_group_by = true;
+/************************</DEBUG PART>************************************/
+
+/*
+ * Order tail of list of group pathkeys by uniqueness descendetly. It allows to
+ * speedup sorting. Returns newly allocated lists, old ones stay untouched.
+ * n_preordered defines a head of list which order should be prevented.
+ */
+void
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+							  List *target_list,
+							  List **group_pathkeys, List **group_clauses,
+							  int n_preordered)
+{
+	struct
+	{
+		PathKey			*pathkey;
+		SortGroupClause	*sgc;
+		Node			*pathkeyExpr;
+	}
+				   *keys, tmp;
+	int				nkeys = list_length(*group_pathkeys) - n_preordered;
+	List		   *pathkeyExprList = NIL,
+				   *new_group_pathkeys = NIL,
+				   *new_group_clauses = NIL;
+	ListCell	   *cell;
+	int				i = 0, n_keys_to_est;
+
+	if (nkeys < 2)
+		return; /* nothing to do */
+
+	/*
+	 * Will try to match ORDER BY pathkeys in hope that one sort is cheaper than
+	 * two
+	 */
+	if (debug_group_by_match_order_by &&
+		n_preordered == 0 && root->sort_pathkeys)
+	{
+		n_preordered = group_keys_reorder_by_pathkeys(root->sort_pathkeys,
+													  group_pathkeys,
+													  group_clauses);
+
+		nkeys = list_length(*group_pathkeys) - n_preordered;
+		if (nkeys < 2)
+			return; /* nothing to do */
+	}
+
+	if (!debug_cheapest_group_by)
+		return;
+
+	keys = palloc(nkeys * sizeof(*keys));
+
+	/*
+	 * Collect information about pathkey for subsequent usage
+	 */
+	for_each_cell(cell, list_nth_cell(*group_pathkeys, n_preordered))
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(cell);
+
+		keys[i].pathkey = pathkey;
+		keys[i].sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+											  *group_clauses);
+		keys[i].pathkeyExpr = get_sortgroupclause_expr(keys[i].sgc,
+													   target_list);
+		i++;
+	}
+
+	/*
+	 * Find the cheapest to sort order of columns. We will find a first column
+	 * with bigger number of group, then pair (first column in pair is  already
+	 * defined in first step), them triple and so on.
+	 */
+	for(n_keys_to_est = 1; n_keys_to_est <= nkeys - 1; n_keys_to_est++)
+	{
+		ListCell   *tail_cell;
+		int			best_i = 0;
+		double		best_est_num_groups = -1;
+
+		/* expand list of columns and remeber last cell */
+		pathkeyExprList = lappend(pathkeyExprList, NULL);
+		tail_cell = list_tail(pathkeyExprList);
+
+		/*
+		 * Find the best last column - the best means bigger number of groups,
+		 * previous columns are already choosen
+		 */
+		for(i = n_keys_to_est - 1; i < nkeys; i++)
+		{
+			double  est_num_groups;
+
+			lfirst(tail_cell) = keys[i].pathkeyExpr;
+			est_num_groups = estimate_num_groups(root, pathkeyExprList,
+												 nrows, NULL);
+
+			if (est_num_groups > best_est_num_groups)
+			{
+				best_est_num_groups = est_num_groups;
+				best_i = i;
+			}
+		}
+
+		/* Save the best choice */
+		lfirst(tail_cell) = keys[best_i].pathkeyExpr;
+		if (best_i != n_keys_to_est - 1)
+		{
+			tmp = keys[n_keys_to_est - 1];
+			keys[n_keys_to_est - 1] = keys[best_i];
+			keys[best_i] = tmp;
+		}
+	}
+	list_free(pathkeyExprList);
+
+	/*
+	 * Construct result lists, keys array is already ordered to get a cheapest
+	 * sort
+	 */
+	i = 0;
+	foreach(cell, *group_pathkeys)
+	{
+		PathKey	   *pathkey;
+		SortGroupClause *sgc;
+
+		if (i < n_preordered)
+		{
+			pathkey = (PathKey *) lfirst(cell);
+			sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+										  *group_clauses);
+		}
+		else
+		{
+			pathkey = keys[i - n_preordered].pathkey;
+			sgc = keys[i - n_preordered].sgc;
+		}
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+		new_group_clauses = lappend(new_group_clauses, sgc);
+
+		i++;
+	}
+
+	pfree(keys);
+
+	/* Just append the rest GROUP BY clauses */
+	new_group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses);
+
+	*group_pathkeys = new_group_pathkeys;
+	*group_clauses = new_group_clauses;
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 1e7809edf2..257aa5889c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6208,11 +6208,20 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 			{
 				/* Sort the cheapest-total path if it isn't already sorted */
 				if (!is_sorted)
+				{
+					if (!parse->groupingSets)
+						get_cheapest_group_keys_order(root,
+													  path->rows,
+													  extra->targetList,
+													  &group_pathkeys,
+													  &group_clauses,
+													  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				/* Now decide what to stick atop it */
 				if (parse->groupingSets)
@@ -6286,6 +6295,12 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 				{
 					if (path != partially_grouped_rel->cheapest_total_path)
 						continue;
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 grouped_rel,
 													 path,
@@ -6560,11 +6575,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 			{
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_path(partially_grouped_rel, (Path *)
@@ -6611,11 +6634,19 @@ create_partial_grouping_paths(PlannerInfo *root,
 
 				/* Sort the cheapest partial path, if it isn't already */
 				if (!is_sorted)
+				{
+					get_cheapest_group_keys_order(root,
+												  path->rows,
+												  extra->targetList,
+												  &group_pathkeys,
+												  &group_clauses,
+												  n_preordered_groups);
 					path = (Path *) create_sort_path(root,
 													 partially_grouped_rel,
 													 path,
 													 group_pathkeys,
 													 -1.0);
+				}
 
 				if (parse->hasAggs)
 					add_partial_path(partially_grouped_rel, (Path *)
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index ee1444c427..4236807433 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1822,7 +1822,26 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
-
+/************************<DEBUG OPT GROUP BY>*********************************/
+	{
+		{"debug_enable_group_by_match_order_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("enable matching GROUP BY by ORDER BY."),
+			NULL
+		},
+		&debug_group_by_match_order_by,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("find a cheapest order of columns in GROUP BY."),
+			NULL
+		},
+		&debug_cheapest_group_by,
+		true,
+		NULL, NULL, NULL
+	},
+/************************</DEBUG OPT GROUP BY>********************************/
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 226b293622..60d8d6c3a6 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -193,6 +193,16 @@ extern bool pathkeys_contained_in(List *keys1, List *keys2);
 extern int group_keys_reorder_by_pathkeys(List *pathkeys,
 										  List **group_pathkeys,
 										  List **group_clauses);
+/************************<DEBUG OPT GROUP BY>*********************************/
+extern bool debug_group_by_match_order_by;
+extern bool debug_cheapest_group_by;
+/************************</DEBUG OPT GROUP BY>********************************/
+extern void get_cheapest_group_keys_order(PlannerInfo *root,
+										  double nrows,
+										  List *target_list,
+										  List **group_pathkeys,
+										  List **group_clauses,
+										  int	n_preordered);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 							   Relids required_outer,
 							   CostSelector cost_criterion,
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index e302dfbdce..31dcf70e47 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -2071,19 +2071,145 @@ SELECT
 	i/2 AS p,
 	format('%60s', i%2) AS v,
 	i/4 AS c,
-	i/8 AS d
+	i/8 AS d,
+	(random() * (10000/8))::int as e --the same as d but no correlation with p
 	INTO btg
 FROM
 	generate_series(1, 10000) i;
-CREATE INDEX ON btg(p, v);
 VACUUM btg;
 ANALYZE btg;
+-- GROUP BY optimization by reorder columns by frequency
 SET enable_hashagg=off;
 SET max_parallel_workers= 0;
 SET max_parallel_workers_per_gather = 0;
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Sort
+         Sort Key: p, v
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, v
+   ->  Sort
+         Sort Key: p, v
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, v, c
+   ->  Sort
+         Sort Key: p, v, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY v, p, c;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: v, p, c
+   ->  Sort
+         Sort Key: v, p, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c;
+          QUERY PLAN          
+------------------------------
+ GroupAggregate
+   Group Key: p, v, d, c
+   ->  Sort
+         Sort Key: p, v, d, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY v, p, d ,c;
+          QUERY PLAN          
+------------------------------
+ GroupAggregate
+   Group Key: v, p, d, c
+   ->  Sort
+         Sort Key: v, p, d, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY p, v, d ,c;
+          QUERY PLAN          
+------------------------------
+ GroupAggregate
+   Group Key: p, v, d, c
+   ->  Sort
+         Sort Key: p, v, d, c
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, d, e
+   ->  Sort
+         Sort Key: p, d, e
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, e, d
+   ->  Sort
+         Sort Key: p, e, d
+         ->  Seq Scan on btg
+(5 rows)
+
+CREATE STATISTICS btg_dep ON d, e, p FROM btg;
+ANALYZE btg;
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, e, d
+   ->  Sort
+         Sort Key: p, e, d
+         ->  Seq Scan on btg
+(5 rows)
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+         QUERY PLAN          
+-----------------------------
+ GroupAggregate
+   Group Key: p, e, d
+   ->  Sort
+         Sort Key: p, e, d
+         ->  Seq Scan on btg
+(5 rows)
+
+-- GROUP BY optimization by reorder columns by index scan
+CREATE INDEX ON btg(p, v);
 SET enable_seqscan=off;
 SET enable_bitmapscan=off;
--- GROUP BY optimization by reorder columns by index scan
+VACUUM btg;
 EXPLAIN (COSTS off)
 SELECT count(*) FROM btg GROUP BY p, v;
                    QUERY PLAN                   
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index b983f9c506..3915a837f0 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -1140,7 +1140,7 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, pl
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  GroupAggregate
-   Group Key: t1.c, t2.c, t3.c
+   Group Key: t1.c, t3.c, t2.c
    ->  Sort
          Sort Key: t1.c, t3.c
          ->  Append
@@ -1284,7 +1284,7 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, ph
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  GroupAggregate
-   Group Key: t1.c, t2.c, t3.c
+   Group Key: t1.c, t3.c, t2.c
    ->  Sort
          Sort Key: t1.c, t3.c
          ->  Append
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 054a381dad..a686a75fb0 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -244,9 +244,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b, c, d
+   Group Key: a, d, c, b
    ->  Sort
-         Sort Key: a, b, c, d
+         Sort Key: a, d, c, b
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -255,9 +255,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: b, c, d
+   Group Key: b, d, c
    ->  Sort
-         Sort Key: b, c, d
+         Sort Key: b, d, c
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -281,9 +281,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b
+   Group Key: b, a
    ->  Sort
-         Sort Key: a, b
+         Sort Key: b, a
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -292,9 +292,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b, c
+   Group Key: b, a, c
    ->  Sort
-         Sort Key: a, b, c
+         Sort Key: b, a, c
          ->  Seq Scan on ndistinct
 (5 rows)
 
@@ -303,9 +303,9 @@ EXPLAIN (COSTS off)
             QUERY PLAN             
 -----------------------------------
  GroupAggregate
-   Group Key: a, b, c, d
+   Group Key: d, b, a, c
    ->  Sort
-         Sort Key: a, b, c, d
+         Sort Key: d, b, a, c
          ->  Seq Scan on ndistinct
 (5 rows)
 
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 7ef703f3a7..e4415c8d84 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -915,22 +915,65 @@ SELECT
 	i/2 AS p,
 	format('%60s', i%2) AS v,
 	i/4 AS c,
-	i/8 AS d
+	i/8 AS d,
+	(random() * (10000/8))::int as e --the same as d but no correlation with p
 	INTO btg
 FROM
 	generate_series(1, 10000) i;
-CREATE INDEX ON btg(p, v);
 
 VACUUM btg;
 ANALYZE btg;
 
+-- GROUP BY optimization by reorder columns by frequency
+
 SET enable_hashagg=off;
 SET max_parallel_workers= 0;
 SET max_parallel_workers_per_gather = 0;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, v;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY v, p, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY v, p, d ,c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY p, v, d ,c;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+
+CREATE STATISTICS btg_dep ON d, e, p FROM btg;
+ANALYZE btg;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, d, e;
+
+EXPLAIN (COSTS off)
+SELECT count(*) FROM btg GROUP BY p, e, d;
+
+
+-- GROUP BY optimization by reorder columns by index scan
+
+CREATE INDEX ON btg(p, v);
 SET enable_seqscan=off;
 SET enable_bitmapscan=off;
+VACUUM btg;
 
--- GROUP BY optimization by reorder columns by index scan
 EXPLAIN (COSTS off)
 SELECT count(*) FROM btg GROUP BY p, v;

tst.sql
Description: application/sql

Re: POC: GROUP BY optimization

Reply via email to