This is an automated email from the ASF dual-hosted git repository.
zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new b7a3e8b87d9 HIVE-28675: Maximize the removal of redundant columns from
GROUP BY clauses (Stamatis Zampetakis reviewed by Soumyakanti Das, Ramesh Kumar)
b7a3e8b87d9 is described below
commit b7a3e8b87d909edd7fc4f939ff87bbadddd61c06
Author: Stamatis Zampetakis <[email protected]>
AuthorDate: Fri Dec 20 17:14:42 2024 +0100
HIVE-28675: Maximize the removal of redundant columns from GROUP BY clauses
(Stamatis Zampetakis reviewed by Soumyakanti Das, Ramesh Kumar)
Enhance HiveRelFieldTrimmer to remove the maximum number of redundant
columns from the GROUP BY clause.
The optimization has the following benefits:
1. Generate more efficient plans by pruning as many columns as possible
(less CPU/IO/network cost).
2. Avoid missing optimization opportunities by examining all candidates.
Close apache/hive#5586
---
.../calcite/rules/HiveRelFieldTrimmer.java | 26 ++---
.../clientpositive/cbo_groupby_remove_key.q | 18 ++++
.../llap/cbo_groupby_remove_key.q.out | 112 +++++++++++++++++++++
3 files changed, 137 insertions(+), 19 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java
index e7dd8bf2b1d..08e98a467b2 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelFieldTrimmer.java
@@ -502,28 +502,16 @@ public class HiveRelFieldTrimmer extends RelFieldTrimmer {
return generateGroupSetIfCardinalitySame(aggregate, originalGroupSet,
fieldsUsed);
}
- // we have set of unique key, get to the key which is same as group by key
- ImmutableBitSet groupByUniqueKey = null;
-
+ // Find the maximum number of columns that can be removed by retaining a
certain unique key
+ ImmutableBitSet columnsToRemove = ImmutableBitSet.of();
+ final ImmutableBitSet unusedGroupingColumns =
aggregate.getGroupSet().except(fieldsUsed);
for (ImmutableBitSet key : uniqueKeys) {
- if (aggregate.getGroupSet().contains(key)) {
- groupByUniqueKey = key;
- break;
+ ImmutableBitSet removeCandidate = unusedGroupingColumns.except(key);
+ if (aggregate.getGroupSet().contains(key) &&
removeCandidate.cardinality() > columnsToRemove.cardinality()) {
+ columnsToRemove = removeCandidate;
}
}
-
- if (groupByUniqueKey == null) {
- // group by keys do not represent unique keys
- return originalGroupSet;
- }
-
- // we know group by key contains primary key and there is at least one
column in group by which is not being used
- // if that column is not part of key it should be removed
- ImmutableBitSet nonKeyColumns =
aggregate.getGroupSet().except(groupByUniqueKey);
- ImmutableBitSet columnsToRemove = nonKeyColumns.except(fieldsUsed);
- ImmutableBitSet newGroupSet =
aggregate.getGroupSet().except(columnsToRemove);
-
- return newGroupSet;
+ return aggregate.getGroupSet().except(columnsToRemove);
}
/**
diff --git a/ql/src/test/queries/clientpositive/cbo_groupby_remove_key.q
b/ql/src/test/queries/clientpositive/cbo_groupby_remove_key.q
new file mode 100644
index 00000000000..2e212d24bf8
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/cbo_groupby_remove_key.q
@@ -0,0 +1,18 @@
+CREATE TABLE passenger
+(
+ id INT NOT NULL,
+ fname STRING NOT NULL,
+ lname STRING NOT NULL,
+ passport STRING NOT NULL,
+ UNIQUE (id) DISABLE RELY,
+ UNIQUE (passport) DISABLE RELY,
+ UNIQUE (fname, lname) DISABLE RELY
+);
+
+EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, passport;
+EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, passport;
+EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, fname, lname,
passport;
+EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, fname,
lname, passport;
+EXPLAIN CBO SELECT fname, COUNT(1) FROM passenger GROUP BY id, fname, lname,
passport;
+EXPLAIN CBO SELECT lname, COUNT(1) FROM passenger GROUP BY id, fname, lname,
passport;
+EXPLAIN CBO SELECT fname, lname, COUNT(1) FROM passenger GROUP BY id, fname,
lname, passport;
diff --git
a/ql/src/test/results/clientpositive/llap/cbo_groupby_remove_key.q.out
b/ql/src/test/results/clientpositive/llap/cbo_groupby_remove_key.q.out
new file mode 100644
index 00000000000..1ff5e906c1d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/cbo_groupby_remove_key.q.out
@@ -0,0 +1,112 @@
+PREHOOK: query: CREATE TABLE passenger
+(
+ id INT NOT NULL,
+ fname STRING NOT NULL,
+ lname STRING NOT NULL,
+ passport STRING NOT NULL,
+ UNIQUE (id) DISABLE RELY,
+ UNIQUE (passport) DISABLE RELY,
+ UNIQUE (fname, lname) DISABLE RELY
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@passenger
+POSTHOOK: query: CREATE TABLE passenger
+(
+ id INT NOT NULL,
+ fname STRING NOT NULL,
+ lname STRING NOT NULL,
+ passport STRING NOT NULL,
+ UNIQUE (id) DISABLE RELY,
+ UNIQUE (passport) DISABLE RELY,
+ UNIQUE (fname, lname) DISABLE RELY
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@passenger
+PREHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id,
passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id,
passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{0}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+
+PREHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY
id, passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY
id, passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{3}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+
+PREHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id,
fname, lname, passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id,
fname, lname, passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{0}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+
+PREHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY
id, fname, lname, passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY
id, fname, lname, passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{3}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+
+PREHOOK: query: EXPLAIN CBO SELECT fname, COUNT(1) FROM passenger GROUP BY id,
fname, lname, passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT fname, COUNT(1) FROM passenger GROUP BY
id, fname, lname, passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(fname=[$1], _o__c1=[$2])
+ HiveAggregate(group=[{0, 1}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+
+PREHOOK: query: EXPLAIN CBO SELECT lname, COUNT(1) FROM passenger GROUP BY id,
fname, lname, passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT lname, COUNT(1) FROM passenger GROUP BY
id, fname, lname, passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(lname=[$1], _o__c1=[$2])
+ HiveAggregate(group=[{0, 2}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+
+PREHOOK: query: EXPLAIN CBO SELECT fname, lname, COUNT(1) FROM passenger GROUP
BY id, fname, lname, passport
+PREHOOK: type: QUERY
+PREHOOK: Input: default@passenger
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO SELECT fname, lname, COUNT(1) FROM passenger
GROUP BY id, fname, lname, passport
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@passenger
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{1, 2}], agg#0=[count()])
+ HiveTableScan(table=[[default, passenger]], table:alias=[passenger])
+