This is an automated email from the ASF dual-hosted git repository.
okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 55d9ab7d6b0 HIVE-29267: Fix NPE on Grouping Sets Optimizer for UNION ALL Queries (#6128)
55d9ab7d6b0 is described below
commit 55d9ab7d6b00fa510be791b9de55974f61c90519
Author: Indhumathi <[email protected]>
AuthorDate: Fri Oct 17 11:37:59 2025 +0530
HIVE-29267: Fix NPE on Grouping Sets Optimizer for UNION ALL Queries (#6128)
---
.../hive/ql/optimizer/GroupingSetOptimizer.java | 6 +
.../groupingset_optimize_hive_28489.q | 16 +++
.../llap/groupingset_optimize_hive_28489.q.out | 131 +++++++++++++++++++++
3 files changed, 153 insertions(+)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
index 2ebbf048905..4563aea73dc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
@@ -182,6 +182,12 @@ private boolean isParentOpFeasible(Operator<?> parentOp) {
  }

  private String selectPartitionColumn(GroupByOperator gby, Operator<?> parentOp) {
+    if (parentOp.getColumnExprMap() == null) {
+      LOG.debug("Skip grouping-set optimization as the parent operator {} does not define a column " +
+          "expression mapping", parentOp);
+      return null;
+    }
+
    if (parentOp.getSchema() == null || parentOp.getSchema().getSignature() == null) {
      LOG.debug("Skip grouping-set optimization as the parent operator {} does not provide signature",
          parentOp);
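
The new guard mirrors the schema checks that follow it: when a prerequisite of the rewrite is missing, selectPartitionColumn returns null and the optimizer leaves the plan untouched instead of dereferencing a null map, which is presumably where the UNION ALL NPE originated. A minimal, self-contained sketch of that guard pattern, using hypothetical stand-in types rather than the real org.apache.hadoop.hive.ql.exec operators:

    import java.util.Map;

    // Hypothetical stand-in for a Hive operator; only the one accessor
    // relevant to this fix is modeled.
    class ParentOp {
      private final Map<String, String> columnExprMap; // null in the UNION ALL case
      ParentOp(Map<String, String> columnExprMap) { this.columnExprMap = columnExprMap; }
      Map<String, String> getColumnExprMap() { return columnExprMap; }
    }

    public class NullGuardSketch {
      // Returns a partition column, or null to signal "skip the optimization".
      static String selectPartitionColumn(ParentOp parentOp) {
        if (parentOp.getColumnExprMap() == null) {
          return null; // without this guard, the map lookup below would throw an NPE
        }
        return parentOp.getColumnExprMap().get("_col0");
      }

      public static void main(String[] args) {
        System.out.println(selectPartitionColumn(new ParentOp(null)));                   // null -> skipped
        System.out.println(selectPartitionColumn(new ParentOp(Map.of("_col0", "key")))); // key
      }
    }

Returning null rather than throwing keeps planning on the unoptimized path, which is what the new q-file test below asserts for the UNION ALL rollup query.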
diff --git a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
index a8e332808d2..b9f81f6af95 100644
--- a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
+++ b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
@@ -1,6 +1,22 @@
-- SORT_QUERY_RESULTS
create table grp_set_test (key string, value string, col0 int, col1 int, col2 int, col3 int);
+
+-- UNION case, can't be optimized
+set hive.optimize.grouping.set.threshold=1;
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup;
+
+explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup;
+
insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1, 1, 1, 2, 3, 100);
-- Should not be optimized
diff --git a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
index 5d0c17d370b..4b87ccfdc52 100644
--- a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
+++ b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
@@ -6,6 +6,137 @@ POSTHOOK: query: create table grp_set_test (key string, value string, col0 int,
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+NULL NULL
+PREHOOK: query: explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Union 2 (CONTAINS)
+ Map 4 <- Union 2 (CONTAINS)
+ Reducer 3 <- Union 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: 'abc' (type: string), col2 (type: int)
+ outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: sum(_col1)
+ keys: _col0 (type: string), 0L (type: bigint)
+ grouping sets: 0, 1
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: bigint)
+ null sort order: zz
+ sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
+                        Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col2 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: 'def' (type: string), col2 (type: int)
+ outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: sum(_col1)
+ keys: _col0 (type: string), 0L (type: bigint)
+ grouping sets: 0, 1
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: bigint)
+ null sort order: zz
+ sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
+                        Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col2 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: string), KEY._col1 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col2
+                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: string), _col2 (type: bigint)
+ outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+                    Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Union 2
+ Vertex: Union 2
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
PREHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1, 1, 1, 2, 3, 100)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table