On Fri, Nov 22, 2024 at 9:24 PM jian he <[email protected]> wrote:
>
> overall i come up with the attached patch.
in v2:
create table t1 (a int, b int not null, c int, unique(b, c));
explain(costs off) select count(*) from t1 group by b,c,a;
QUERY PLAN
----------------------
HashAggregate
Group Key: b
-> Seq Scan on t1
so the v2 implementation is wrong.
I overlooked cases like: only part of the unique-key is not-null.
In that situation, we cannot remove useless groupby columns.
v3 attached. in v3:
QUERY PLAN
----------------------
HashAggregate
Group Key: b, c, a
-> Seq Scan on t1
From 85a64d350f0282c4bfb79a0f70d57eca29a78308 Mon Sep 17 00:00:00 2001
From: jian he <[email protected]>
Date: Wed, 27 Nov 2024 14:45:03 +0800
Subject: [PATCH v3 1/1] remove useless group by columns via unique-not-null
in remove_useless_groupby_columns, if primary key exists and is a subset of
group by columns. then we remove non-primary-key column in group-by clause.
if primary key is not available, then we try to use columns that are unique and
not-null to remove other useless columns.
discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spark
---
src/backend/catalog/index.c | 94 +++++++++++++++++++++++
src/backend/optimizer/plan/planner.c | 74 +++++++++++++++++-
src/include/catalog/index.h | 1 +
src/test/regress/expected/aggregates.out | 96 ++++++++++++++++++++++++
src/test/regress/sql/aggregates.sql | 47 ++++++++++++
5 files changed, 309 insertions(+), 3 deletions(-)
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f9bb721c5f..cd5b1ff3e6 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -4257,3 +4257,97 @@ RestoreReindexState(const void *reindexstate)
/* Note the worker has its own transaction nesting level */
reindexingNestLevel = GetCurrentTransactionNestLevel();
}
+
+/*
+ * given a input relation oid, return a list of attnum Bitmapset(soure:
+ * pg_index.indkey) that have unique index *and* not-null constraint associate
+ * with it. note we do not check this relation's primary key.
+*/
+List *
+get_unique_not_null_attnos(Oid relid)
+{
+ List *not_null_attnos = NIL;
+ Relation pg_index;
+ HeapTuple indexTuple;
+ SysScanDesc scan;
+ ScanKeyData skey;
+ List *not_null_cs;
+ List *tlist = NIL;
+
+ /* Use CookedConstraint to fetch not-null constraint. */
+ not_null_cs = RelationGetNotNullConstraints(relid, true, false);
+ if (not_null_cs == NIL)
+ return NULL;
+
+ foreach_ptr(CookedConstraint, cc, not_null_cs)
+ not_null_attnos = lappend_int(not_null_attnos, cc->attnum);
+
+ /* Scan pg_index for unique index of the target rel */
+ pg_index = table_open(IndexRelationId, AccessShareLock);
+
+ ScanKeyInit(&skey,
+ Anum_pg_index_indrelid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(relid));
+ scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true,
+ NULL, 1, &skey);
+
+ while (HeapTupleIsValid(indexTuple = systable_getnext(scan)))
+ {
+ Bitmapset *unique_notnull_attnos = NULL;
+ Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple);
+ Datum adatum;
+ bool isNull;
+ bool all_notnull = true;
+ int numkeys;
+ Datum *keys;
+ int nKeys;
+
+ /* we're only interested in unique index */
+ if (!index->indisunique || index->indisprimary)
+ continue;
+
+ /* Skip invalid, exclusion index or deferred index */
+ if (!index->indisvalid || index->indisexclusion || !index->indimmediate)
+ continue;
+
+ /* Skip expression index or predicate index */
+ if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) ||
+ !heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL))
+ continue;
+
+ /* Extract the pg_index->indkey int2vector */
+ adatum = heap_getattr(indexTuple, Anum_pg_index_indkey,
+ RelationGetDescr(pg_index), &isNull);
+ if (isNull)
+ elog(ERROR, "null int2vector for index %u", index->indexrelid);
+
+ deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID,
+ &keys, NULL, &nKeys);
+
+ if(nKeys != index->indnatts)
+ elog(ERROR, "corrupted int2vector for index %u", index->indexrelid);
+
+ Assert(nKeys >= index->indnkeyatts);
+ numkeys = index->indnkeyatts;
+
+ for (int i = 0; i < numkeys; i++)
+ {
+ /* Skip if any of unique key's attnum don't have not-null */
+ if (!list_member_int(not_null_attnos, DatumGetInt16(keys[i])))
+ {
+ all_notnull = false;
+ break;
+ }
+
+ unique_notnull_attnos = bms_add_member(unique_notnull_attnos,
+ DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber);
+ }
+ if (all_notnull)
+ tlist = lappend(tlist, unique_notnull_attnos);
+ }
+ systable_endscan(scan);
+
+ table_close(pg_index, AccessShareLock);
+ return tlist;
+}
\ No newline at end of file
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b665a7762e..39d92b2b53 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -27,6 +27,7 @@
#include "catalog/pg_inherits.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
+#include "catalog/index.h"
#include "executor/executor.h"
#include "foreign/fdwapi.h"
#include "jit/jit.h"
@@ -2807,6 +2808,9 @@ remove_useless_groupby_columns(PlannerInfo *root)
Bitmapset *relattnos;
Bitmapset *pkattnos;
Oid constraintOid;
+ Bitmapset *attnums = NULL;
+ List *unique_notnull_attnos = NIL;
+ List *matched = NIL;
relid++;
@@ -2828,18 +2832,69 @@ remove_useless_groupby_columns(PlannerInfo *root)
continue;
/*
- * Can't remove any columns for this rel if there is no suitable
- * (i.e., nondeferrable) primary key constraint.
+ * Can't remove any columns for this rel if there is no suitable (i.e.,
+ * nondeferrable) primary key constraint. However if there is columns
+ * both have unique-index and not-null constraint, we can also remove
+ * some columns. If primary key is there, we won't looking for
+ * unique-not-null constraints.
*/
pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
if (pkattnos == NULL)
+ {
+ unique_notnull_attnos = get_unique_not_null_attnos(rte->relid);
+ if (list_length(unique_notnull_attnos) > 0)
+ {
+ int out_num,
+ inner_num;
+
+ foreach_node(Bitmapset, attnos, unique_notnull_attnos)
+ {
+ Assert(attnos != NULL);
+
+ if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1)
+ continue;
+ else
+ matched = lappend(matched, attnos);
+ }
+ /*
+ * There may be multiple unique-index-not-null attnums that are
+ * subsets of groupbyattnos. We select the one with the fewest
+ * members. If there are several with the same number of
+ * members , we choose the first one. We avoid using `foreach`
+ * here because we delete items from the list.
+ */
+ for (int outerpos = 0; outerpos < list_length(matched); outerpos++)
+ {
+ Bitmapset *bt;
+ bt = list_nth_node(Bitmapset, matched, outerpos);
+
+ out_num = bms_num_members(bt);
+ for (int restpos = outerpos + 1; restpos < list_length(matched);)
+ {
+ Bitmapset *other;
+ other = list_nth_node(Bitmapset, matched, restpos);
+
+ inner_num = bms_num_members(other);
+ if (inner_num > out_num)
+ matched = list_delete_nth_cell(matched, restpos);
+ else if (inner_num < out_num)
+ matched = list_delete_nth_cell(matched, outerpos);
+ restpos++;
+ }
+ }
+ /* If there is at least one candidate, select the first one. */
+ if (list_length(matched) > 0)
+ attnums = list_nth_node(Bitmapset, matched, 0);
+ }
+ }
+ if ((pkattnos == NULL && attnums == NULL))
continue;
/*
* If the primary key is a proper subset of relattnos then we have
* some items in the GROUP BY that can be removed.
*/
- if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+ if (pkattnos != NULL && bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
{
/*
* To easily remember whether we've found anything to do, we don't
@@ -2852,6 +2907,19 @@ remove_useless_groupby_columns(PlannerInfo *root)
/* Remember the attnos of the removable columns */
surplusvars[relid] = bms_difference(relattnos, pkattnos);
}
+ /*
+ * If attnums is not NULL then this attnums (unique-index-not-null) is a
+ * proper subset of relattnos. see above. we can remove some items in
+ * GROUOP BY
+ */
+ else if(attnums != NULL)
+ {
+ if (surplusvars == NULL)
+ surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+ (list_length(parse->rtable) + 1));
+
+ surplusvars[relid] = bms_difference(relattnos, attnums);
+ }
}
/*
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 2dea96f47c..dacc0f6ebd 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate);
extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid);
+extern List *get_unique_not_null_attnos(Oid relid);
/*
* itemptr_encode - Encode ItemPointer as int64/int8
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..563b5c1c0d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1454,6 +1454,102 @@ drop table t2;
drop table t3;
drop table p_t1;
--
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+ c int not null, d int not null,
+ unique (a, b), unique(b, c), unique(a, c, d));
+create table t5 (a int not null, b int not null,
+ c int not null, d int not null,
+ unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10);
+ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+ QUERY PLAN
+----------------------
+ HashAggregate
+ Group Key: a, b, c
+ -> Seq Scan on t1
+(3 rows)
+
+-- both unique not null index and primary key is there,
+-- using primary key to remove useless group by columns
+explain (costs off) select * from t2 group by a,b,c;
+ QUERY PLAN
+----------------------
+ HashAggregate
+ Group Key: a, b
+ -> Seq Scan on t2
+(3 rows)
+
+-- Test primary key beats unique not null index.
+explain (costs off) select * from t3 group by a,b,c,d;
+ QUERY PLAN
+----------------------
+ HashAggregate
+ Group Key: a, b
+ -> Seq Scan on t3
+(3 rows)
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+ QUERY PLAN
+----------------------
+ HashAggregate
+ Group Key: a, b
+ -> Seq Scan on t4
+(3 rows)
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+ QUERY PLAN
+----------------------
+ HashAggregate
+ Group Key: c, b
+ -> Seq Scan on t4
+(3 rows)
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+ QUERY PLAN
+-----------------------------------
+ HashAggregate
+ Group Key: t5.a, t5.b
+ -> Append
+ -> Seq Scan on t5_0 t5_1
+ -> Seq Scan on t5_1 t5_2
+(5 rows)
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+ QUERY PLAN
+-----------------------------------
+ HashAggregate
+ Group Key: t5.b, t5.a
+ -> Append
+ -> Seq Scan on t5_0 t5_1
+ -> Seq Scan on t5_1 t5_2
+(5 rows)
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+ QUERY PLAN
+-------------------------
+ HashAggregate
+ Group Key: b, c, a, d
+ -> Seq Scan on t6
+(3 rows)
+
+drop table t1, t2,t3,t4,t5,t6;
+--
-- Test GROUP BY matching of join columns that are type-coerced due to USING
--
create temp table t1(f1 int, f2 int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..66f4f45953 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -512,6 +512,53 @@ drop table t2;
drop table t3;
drop table p_t1;
+--
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+ c int not null, d int not null,
+ unique (a, b), unique(b, c), unique(a, c, d));
+
+create table t5 (a int not null, b int not null,
+ c int not null, d int not null,
+ unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10);
+ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+
+-- both unique not null index and primary key is there,
+-- using primary key to remove useless group by columns
+explain (costs off) select * from t2 group by a,b,c;
+
+-- Test primary key beats unique not null index.
+explain (costs off) select * from t3 group by a,b,c,d;
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+
+drop table t1, t2,t3,t4,t5,t6;
--
-- Test GROUP BY matching of join columns that are type-coerced due to USING
--
--
2.34.1