This is an automated email from the ASF dual-hosted git repository.
jdere pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new f229faf HIVE-21799: NullPointerException in
DynamicPartitionPruningOptimization, when join key is on aggregation column
(Jason Dere, reviewed by Vineet Garg)
f229faf is described below
commit f229fafcbdb9b9f1e48df74b35f0199ae6a193f8
Author: Jason Dere <[email protected]>
AuthorDate: Wed Jun 12 10:22:51 2019 -0700
HIVE-21799: NullPointerException in DynamicPartitionPruningOptimization,
when join key is on aggregation column (Jason Dere, reviewed by Vineet Garg)
---
.../test/resources/testconfiguration.properties | 1 +
.../DynamicPartitionPruningOptimization.java | 13 +-
.../dynamic_semijoin_reduction_on_aggcol.q | 17 ++
.../dynamic_semijoin_reduction_on_aggcol.q.out | 183 +++++++++++++++++++++
4 files changed, 205 insertions(+), 9 deletions(-)
diff --git a/itests/src/test/resources/testconfiguration.properties
b/itests/src/test/resources/testconfiguration.properties
index f1a27a2..3f1ce7c 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -517,6 +517,7 @@ minillaplocal.query.files=\
dynamic_semijoin_reduction_2.q,\
dynamic_semijoin_reduction_3.q,\
dynamic_semijoin_reduction_4.q,\
+ dynamic_semijoin_reduction_on_aggcol.q,\
dynamic_semijoin_reduction_sw.q,\
dynpart_sort_opt_vectorization.q,\
dynpart_sort_optimization.q,\
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index 189c68d..6686273 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -604,16 +604,11 @@ public class DynamicPartitionPruningOptimization
implements NodeProcessor {
// Create the column expr map
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ExprNodeDesc exprNode = null;
- if ( parentOfRS.getColumnExprMap() != null) {
- exprNode = parentOfRS.getColumnExprMap().get(internalColName).clone();
- } else {
- exprNode = new ExprNodeColumnDesc(columnInfo);
- }
-
- if (exprNode instanceof ExprNodeColumnDesc) {
- ExprNodeColumnDesc encd = (ExprNodeColumnDesc) exprNode;
- encd.setColumn(internalColName);
+ if (columnInfo == null) {
+ LOG.debug("No ColumnInfo found in {} for {}",
parentOfRS.getOperatorId(), internalColName);
+ return false;
}
+ exprNode = new ExprNodeColumnDesc(columnInfo);
colExprMap.put(internalColName, exprNode);
// Create the Select Operator
diff --git
a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q
b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q
new file mode 100644
index 0000000..e7c8db3
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_on_aggcol.q
@@ -0,0 +1,17 @@
+--! qt:dataset:src
+set hive.explain.user=false;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.min.bloom.filter.entries=1;
+
+create table dynamic_semijoin_reduction_on_aggcol(id int, outcome string,
eventid int) stored as orc;
+insert into dynamic_semijoin_reduction_on_aggcol select key, value, key from
src;
+
+explain select a.id, b.outcome from (select id, max(eventid) as event_id_max
from dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid;
+
+select a.id, b.outcome from (select id, max(eventid) as event_id_max from
dynamic_semijoin_reduction_on_aggcol where id = 0 group by id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid;
diff --git
a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out
b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out
new file mode 100644
index 0000000..9f2af56
--- /dev/null
+++
b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction_on_aggcol.q.out
@@ -0,0 +1,183 @@
+PREHOOK: query: create table dynamic_semijoin_reduction_on_aggcol(id int,
outcome string, eventid int) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+POSTHOOK: query: create table dynamic_semijoin_reduction_on_aggcol(id int,
outcome string, eventid int) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+PREHOOK: query: insert into dynamic_semijoin_reduction_on_aggcol select key,
value, key from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+POSTHOOK: query: insert into dynamic_semijoin_reduction_on_aggcol select key,
value, key from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dynamic_semijoin_reduction_on_aggcol
+POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.eventid EXPRESSION
[(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.id EXPRESSION
[(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: dynamic_semijoin_reduction_on_aggcol.outcome SIMPLE
[(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain select a.id, b.outcome from (select id, max(eventid)
as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+POSTHOOK: query: explain select a.id, b.outcome from (select id, max(eventid)
as event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 5 <- Reducer 4 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Map 5 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
+ Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: dynamic_semijoin_reduction_on_aggcol
+ filterExpr: (id = 0) (type: boolean)
+ Statistics: Num rows: 500 Data size: 4000 Basic stats:
COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (id = 0) (type: boolean)
+ Statistics: Num rows: 2 Data size: 16 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: eventid (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 2 Data size: 16 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: max(_col1)
+ keys: true (type: boolean)
+ minReductionHashAggr: 0.5
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: boolean)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: boolean)
+ Statistics: Num rows: 1 Data size: 8 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: b
+ filterExpr: (eventid is not null and (eventid BETWEEN
DynamicValue(RS_11_dynamic_semijoin_reduction_on_aggcol__col1_min) AND
DynamicValue(RS_11_dynamic_semijoin_reduction_on_aggcol__col1_max) and
in_bloom_filter(eventid,
DynamicValue(RS_11_dynamic_semijoin_reduction_on_aggcol__col1_bloom_filter))))
(type: boolean)
+ Statistics: Num rows: 500 Data size: 47500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((eventid BETWEEN
DynamicValue(RS_11_dynamic_semijoin_reduction_on_aggcol__col1_min) AND
DynamicValue(RS_11_dynamic_semijoin_reduction_on_aggcol__col1_max) and
in_bloom_filter(eventid,
DynamicValue(RS_11_dynamic_semijoin_reduction_on_aggcol__col1_bloom_filter)))
and eventid is not null) (type: boolean)
+ Statistics: Num rows: 500 Data size: 47500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: outcome (type: string), eventid (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 47500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 500 Data size: 47500 Basic
stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: max(VALUE._col0)
+ keys: KEY._col0 (type: boolean)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: _col1 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE
Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0),
bloom_filter(_col0, expectedEntries=1)
+ minReductionHashAggr: 0.0
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type:
int), _col2 (type: binary)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Left Outer Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 2 Data size: 182 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: 0 (type: int), _col1 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 190 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 4
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1),
bloom_filter(VALUE._col2, expectedEntries=1)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE
Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: binary)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select a.id, b.outcome from (select id, max(eventid) as
event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+POSTHOOK: query: select a.id, b.outcome from (select id, max(eventid) as
event_id_max from dynamic_semijoin_reduction_on_aggcol where id = 0 group by
id) a
+LEFT OUTER JOIN dynamic_semijoin_reduction_on_aggcol b
+on a.event_id_max = b.eventid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dynamic_semijoin_reduction_on_aggcol
+#### A masked pattern was here ####
+0 val_0
+0 val_0
+0 val_0