This is an automated email from the ASF dual-hosted git repository.
sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new 2058c2e0dee HIVE-27573: Backport of HIVE-21799: NullPointerException
in DynamicPartitionPruningOptimization, when join key is on aggregation column
to branch-3
2058c2e0dee is described below
commit 2058c2e0dee430f0a8bc9f4a3cadd75d5e087091
Author: Shefali Singh <[email protected]>
AuthorDate: Thu Sep 28 17:13:31 2023 +0530
HIVE-27573: Backport of HIVE-21799: NullPointerException in
DynamicPartitionPruningOptimization, when join key is on aggregation column to
branch-3
Signed-off-by: Sankar Hariappan <[email protected]>
Closes (#4556)
---
.../test/resources/testconfiguration.properties | 1 +
.../DynamicPartitionPruningOptimization.java | 13 +-
.../dynamic_semijoin_reduction_on_aggcol.q | 17 +++
.../dynamic_semijoin_reduction_on_aggcol.q.out | 149 +++++++++++++++++++++
4 files changed, 171 insertions(+), 9 deletions(-)
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index a5bce33d74f..144a5a8ad48 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -518,6 +518,7 @@ minillaplocal.query.files=\
dynamic_semijoin_reduction_2.q,\
dynamic_semijoin_reduction_3.q,\
dynamic_semijoin_reduction_4.q,\
+ dynamic_semijoin_reduction_on_aggcol.q,\
dynamic_semijoin_reduction_sw.q,\
dynpart_sort_opt_vectorization.q,\
dynpart_sort_optimization.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index a1401aac72c..d84f10b4c38 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -576,16 +576,11 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
// Create the column expr map
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ExprNodeDesc exprNode = null;
- if ( parentOfRS.getColumnExprMap() != null) {
- exprNode = parentOfRS.getColumnExprMap().get(internalColName).clone();
- } else {
- exprNode = new ExprNodeColumnDesc(columnInfo);
- }
-
- if (exprNode instanceof ExprNodeColumnDesc) {
- ExprNodeColumnDesc encd = (ExprNodeColumnDesc) exprNode;
- encd.setColumn(internalColName);
+ if (columnInfo == null) {
+ LOG.debug("No ColumnInfo found in {} for {}", parentOfRS.getOperatorId(), internalColName);
+ return false;
}
+ exprNode = new ExprNodeColumnDesc(columnInfo);
colExprMap.put(internalColName, exprNode);
// Create the Select Operator
diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q
new file mode 100644
index 00000000000..e7c8db3e778
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q
@@ -0,0 +1,17 @@
+--! qt:dataset:src
+set hive.explain.user=false;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.min.bloom.filter.entries=1;
+
+create table dynamic_semijoin_reduction_on_aggcol(id int, outcome string, eventid int) stored as orc;
+insert into dynamic_semijoin_reduction_on_aggcol select key, value, key from src;
+
+explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid;
+
+select a.id, b.outcome from (select id, max(eventid) as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid;
diff --git a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out
new file mode 100644
index 00000000000..4d29456df26
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out
@@ -0,0 +1,149 @@
+PREHOOK: query: create table dynamic_semijoin_reduction_on_aggcol(id int,
outcome string, eventid int) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+POSTHOOK: query: create table dynamic_semijoin_reduction_on_aggcol(id int,
outcome string, eventid int) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+PREHOOK: query: insert into dynamic_semijoin_reduction_on_aggcol select key,
value, key from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+POSTHOOK: query: insert into dynamic_semijoin_reduction_on_aggcol select key,
value, key from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.eventid EXPRESSION
[(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.id EXPRESSION
[(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.outcome SIMPLE
[(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain select a.id, b.outcome from (select id, max(eventid)
as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+POSTHOOK: query: explain select a.id, b.outcome from (select id, max(eventid)
as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Map 4 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: dynamic_semijoin_reduction_on_aggcol
+ Statistics: Num rows: 500 Data size: 4000 Basic stats:
COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (id = 0) (type: boolean)
+ Statistics: Num rows: 2 Data size: 16 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: eventid (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 2 Data size: 16 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: max(_col1)
+ keys: 0 (type: int)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 1 Data size: 8 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: b
+ Statistics: Num rows: 500 Data size: 47500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: outcome (type: string), eventid (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 47500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 500 Data size: 47500 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: max(VALUE._col0)
+ keys: KEY._col0 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: _col1 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Left Outer Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 1 Data size: 91 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: 0 (type: int), _col1 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 95 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select a.id, b.outcome from (select id, max(eventid) as
event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+POSTHOOK: query: select a.id, b.outcome from (select id, max(eventid) as
event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+0 val_0
+0 val_0
+0 val_0