Repository: hive
Updated Branches:
  refs/heads/master dabb62d6e -> 217811254
HIVE-18721 : Bucket Map Join : Handle empty buckets (Deepak Jaiswal, reviewed by Gunther Hagleitner) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/21781125 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/21781125 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/21781125 Branch: refs/heads/master Commit: 21781125419826bd1ae76d980f958419a84d84f3 Parents: dabb62d Author: Deepak Jaiswal <djais...@apache.org> Authored: Thu Feb 15 12:26:07 2018 -0800 Committer: Deepak Jaiswal <djais...@apache.org> Committed: Thu Feb 15 12:26:54 2018 -0800 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 1 + .../hive/ql/exec/tez/CustomPartitionEdge.java | 7 ++ .../clientpositive/bucket_map_join_tez_empty.q | 18 +++ .../llap/bucket_map_join_tez_empty.q.out | 121 +++++++++++++++++++ 4 files changed, 147 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/21781125/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index c2252f3..942f97a 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -506,6 +506,7 @@ minillaplocal.query.files=\ bucket_many.q,\ bucket_map_join_tez1.q,\ bucket_map_join_tez2.q,\ + bucket_map_join_tez_empty.q,\ bucketizedhiveinputformat.q,\ bucketmapjoin6.q,\ bucketmapjoin7.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/21781125/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionEdge.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionEdge.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionEdge.java index 1ac1d14..4248cd9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionEdge.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/CustomPartitionEdge.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.exec.tez; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -84,6 +85,12 @@ public class CustomPartitionEdge extends EdgeManagerPlugin { @Override public void routeDataMovementEventToDestination(DataMovementEvent event, int sourceTaskIndex, int sourceOutputIndex, Map<Integer, List<Integer>> mapDestTaskIndices) { + if (conf.getRoutingTable().get(sourceOutputIndex).size() == 0) { + // No task for given input, return empty list with -1 as index + mapDestTaskIndices.put(-1, new ArrayList<>()); + return; + } + // Normal case. List<Integer> outputIndices = Collections.singletonList(sourceTaskIndex); for (Integer destIndex : conf.getRoutingTable().get(sourceOutputIndex)) { mapDestTaskIndices.put(destIndex, outputIndices); http://git-wip-us.apache.org/repos/asf/hive/blob/21781125/ql/src/test/queries/clientpositive/bucket_map_join_tez_empty.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/bucket_map_join_tez_empty.q b/ql/src/test/queries/clientpositive/bucket_map_join_tez_empty.q new file mode 100644 index 0000000..cc43b5b --- /dev/null +++ b/ql/src/test/queries/clientpositive/bucket_map_join_tez_empty.q @@ -0,0 +1,18 @@ +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask=true; +set hive.auto.convert.join.noconditionaltask.size=10000; + +CREATE TABLE tab1(key1 int, value string) CLUSTERED BY (key1) INTO 10 BUCKETS STORED AS TEXTFILE; +CREATE TABLE tab2 (key1 int, value string) CLUSTERED BY (key1) 
INTO 10 BUCKETS STORED AS TEXTFILE; + + +-- HIVE-18721 : Make sure only certain buckets have data. +insert into tab1 VALUES (1,"abc"),(4,"def"),(8, "ghi"); +insert into tab2 VALUES (1, "abc"), (5, "aa"); + +set hive.convert.join.bucket.mapjoin.tez = true; + +explain select * from tab1, tab2 where tab1.key1 = tab2.key1; +select * from tab1, tab2 where tab1.key1 = tab2.key1; http://git-wip-us.apache.org/repos/asf/hive/blob/21781125/ql/src/test/results/clientpositive/llap/bucket_map_join_tez_empty.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez_empty.q.out b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez_empty.q.out new file mode 100644 index 0000000..33825da --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez_empty.q.out @@ -0,0 +1,121 @@ +PREHOOK: query: CREATE TABLE tab1(key1 int, value string) CLUSTERED BY (key1) INTO 10 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab1 +POSTHOOK: query: CREATE TABLE tab1(key1 int, value string) CLUSTERED BY (key1) INTO 10 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab1 +PREHOOK: query: CREATE TABLE tab2 (key1 int, value string) CLUSTERED BY (key1) INTO 10 BUCKETS STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab2 +POSTHOOK: query: CREATE TABLE tab2 (key1 int, value string) CLUSTERED BY (key1) INTO 10 BUCKETS STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab2 +PREHOOK: query: insert into tab1 VALUES (1,"abc"),(4,"def"),(8, "ghi") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tab1 +POSTHOOK: query: insert into tab1 VALUES (1,"abc"),(4,"def"),(8, "ghi") +POSTHOOK: type: QUERY 
+POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tab1 +POSTHOOK: Lineage: tab1.key1 SCRIPT [] +POSTHOOK: Lineage: tab1.value SCRIPT [] +PREHOOK: query: insert into tab2 VALUES (1, "abc"), (5, "aa") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tab2 +POSTHOOK: query: insert into tab2 VALUES (1, "abc"), (5, "aa") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tab2 +POSTHOOK: Lineage: tab2.key1 SCRIPT [] +POSTHOOK: Lineage: tab2.value SCRIPT [] +PREHOOK: query: explain select * from tab1, tab2 where tab1.key1 = tab2.key1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from tab1, tab2 where tab1.key1 = tab2.key1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (CUSTOM_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tab1 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key1 (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + input vertices: + 1 Map 2 + Statistics: Num rows: 2 Data size: 364 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 364 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: llap + LLAP IO: no inputs + Map 2 + Map Operator Tree: + TableScan + alias: tab2 + Statistics: Num rows: 2 Data size: 182 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 2 Data size: 182 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key1 (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 182 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 2 Data size: 182 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from tab1, tab2 where tab1.key1 = tab2.key1 +PREHOOK: type: QUERY +PREHOOK: Input: default@tab1 +PREHOOK: Input: default@tab2 +#### A masked pattern was here #### +POSTHOOK: query: select * from tab1, tab2 where tab1.key1 = tab2.key1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab1 +POSTHOOK: Input: default@tab2 +#### A masked pattern was here #### +1 abc 1 abc