This is an automated email from the ASF dual-hosted git repository.
okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 55d9ab7d6b0 HIVE-29267: Fix NPE on Grouping Sets Optimizer for UNION ALL Queries (#6128)
55d9ab7d6b0 is described below
commit 55d9ab7d6b00fa510be791b9de55974f61c90519
Author: Indhumathi <[email protected]>
AuthorDate: Fri Oct 17 11:37:59 2025 +0530
HIVE-29267: Fix NPE on Grouping Sets Optimizer for UNION ALL Queries (#6128)
---
.../hive/ql/optimizer/GroupingSetOptimizer.java | 6 +
.../groupingset_optimize_hive_28489.q | 16 +++
.../llap/groupingset_optimize_hive_28489.q.out | 131 +++++++++++++++++++++
3 files changed, 153 insertions(+)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
index 2ebbf048905..4563aea73dc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
@@ -182,6 +182,12 @@ private boolean isParentOpFeasible(Operator<?> parentOp) {
  }

  private String selectPartitionColumn(GroupByOperator gby, Operator<?> parentOp) {
+    if (parentOp.getColumnExprMap() == null) {
+      LOG.debug("Skip grouping-set optimization as the parent operator {} does not define a column " +
+          "expression mapping", parentOp);
+      return null;
+    }
+
    if (parentOp.getSchema() == null || parentOp.getSchema().getSignature() == null) {
      LOG.debug("Skip grouping-set optimization as the parent operator {} does not provide signature",
          parentOp);
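
The new guard mirrors the schema checks that follow it: when a prerequisite of the rewrite is missing, selectPartitionColumn returns null and the optimizer leaves the plan untouched instead of dereferencing a null map, which is presumably where the UNION ALL NPE originated. A minimal, self-contained sketch of that guard pattern, using hypothetical stand-in types rather than the real org.apache.hadoop.hive.ql.exec operators:

    import java.util.Map;

    // Hypothetical stand-in for a Hive operator; only the one accessor
    // relevant to this fix is modeled.
    class ParentOp {
      private final Map<String, String> columnExprMap; // null in the UNION ALL case
      ParentOp(Map<String, String> columnExprMap) { this.columnExprMap = columnExprMap; }
      Map<String, String> getColumnExprMap() { return columnExprMap; }
    }

    public class NullGuardSketch {
      // Returns a partition column, or null to signal "skip the optimization".
      static String selectPartitionColumn(ParentOp parentOp) {
        if (parentOp.getColumnExprMap() == null) {
          return null; // without this guard, the map lookup below would throw an NPE
        }
        return parentOp.getColumnExprMap().get("_col0");
      }

      public static void main(String[] args) {
        System.out.println(selectPartitionColumn(new ParentOp(null)));                   // null -> skipped
        System.out.println(selectPartitionColumn(new ParentOp(Map.of("_col0", "key")))); // key
      }
    }

Returning null rather than throwing keeps planning on the unoptimized path, which is what the new q-file test below asserts for the UNION ALL rollup query.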
diff --git a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
index a8e332808d2..b9f81f6af95 100644
--- a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
+++ b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
@@ -1,6 +1,22 @@
-- SORT_QUERY_RESULTS
create table grp_set_test (key string, value string, col0 int, col1 int, col2 int, col3 int);
+
+-- UNION case, can't be optimized
+set hive.optimize.grouping.set.threshold=1;
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup;
+
+explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup;
+
insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1, 1, 1, 2, 3, 100);
-- Should not be optimized
diff --git a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
index 5d0c17d370b..4b87ccfdc52 100644
--- a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
+++ b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
@@ -6,6 +6,137 @@ POSTHOOK: query: create table grp_set_test (key string, value string, col0 int,
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+NULL NULL
+PREHOOK: query: explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Union 2 (CONTAINS)
+ Map 4 <- Union 2 (CONTAINS)
+ Reducer 3 <- Union 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: 'abc' (type: string), col2 (type: int)
+ outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: sum(_col1)
+ keys: _col0 (type: string), 0L (type: bigint)
+ grouping sets: 0, 1
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: bigint)
+ null sort order: zz
+ sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
+                        Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col2 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: 'def' (type: string), col2 (type: int)
+ outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: sum(_col1)
+ keys: _col0 (type: string), 0L (type: bigint)
+ grouping sets: 0, 1
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: bigint)
+ null sort order: zz
+ sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
+                        Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col2 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: string), KEY._col1 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col2
+                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: string), _col2 (type: bigint)
+ outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+                    Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Union 2
+ Vertex: Union 2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
PREHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1, 1, 1, 2, 3, 100)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table