This is an automated email from the ASF dual-hosted git repository. okumin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new bfac94f1bcb HIVE-28798: Bucket Map Join partially using partition transforms (#5670) bfac94f1bcb is described below commit bfac94f1bcb36bba620a5a59e9c6e9a087bb6a9b Author: Shohei Okumiya <oku...@apache.org> AuthorDate: Fri Jul 18 21:24:38 2025 +0900 HIVE-28798: Bucket Map Join partially using partition transforms (#5670) --- .../queries/positive/iceberg_bucket_map_join_8.q | 111 +++- .../positive/llap/iceberg_bucket_map_join_8.q.out | 576 +++++++++++++++++---- .../annotation/OpTraitsRulesProcFactory.java | 105 ++-- 3 files changed, 636 insertions(+), 156 deletions(-) diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q index 91b30c7b231..4852a67cd35 100644 --- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q @@ -4,47 +4,110 @@ set hive.auto.convert.join=true; set hive.optimize.dynamic.partition.hashjoin=false; set hive.convert.join.bucket.mapjoin.tez=true; -CREATE TABLE srcbucket_big(key int, value string, id int) -PARTITIONED BY SPEC(bucket(4, key)) STORED BY ICEBERG; +CREATE TABLE srcbucket_big(key1 int, key2 string, value string, id int) +PARTITIONED BY SPEC(bucket(4, key1), bucket(8, key2)) STORED BY ICEBERG; INSERT INTO srcbucket_big VALUES -(101, 'val_101', 1), -(null, 'val_102', 2), -(103, 'val_103', 3), -(104, null, 4), -(105, 'val_105', 5), -(null, null, 6); - -CREATE TABLE src_small(key int, value string); +(101, '1001', 'val_101', 1), +(null, '1002', 'val_102', 2), +(103, null, 'val_103', 3), +(104, '1004', null, 4), +(105, '1005', 'val_105', 5), +(101, '1001', 'val_101', 6), +(null, '1002', 'val_102', 7), +(103, null, 'val_103', 8), +(104, '1004', null, 9), +(105, '1005', 'val_105', 10), +(101, '1001', 'val_101', 11), +(null, '1002', 'val_102', 12), +(103, null, 'val_103', 13), +(104, '1004', null, 14), +(105, '1005', 'val_105', 15), +(101, '1001', 'val_101', 16), +(null, '1002', 'val_102', 17), +(103, null, 'val_103', 18), +(104, '1004', null, 19), +(105, '1005', 'val_105', 20), +(null, null, null, 21); + +CREATE TABLE src_small(key1 int, key2 string, value string); INSERT INTO src_small VALUES -(101, 'val_101'), -(null, 'val_102'), -(103, 'val_103'), -(104, null), -(105, 'val_105'), -(null, null); +(101, '1001', 'val_101'), +(null, '1002', 'val_102'), +(103, null, 'val_103'), +(104, '1004', null), +(105, '1005', 'val_105'), +(null, null, null); SELECT * FROM srcbucket_big ORDER BY id; --- Using the bucket column +-- key1 +EXPLAIN +SELECT a.key1, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 +ORDER BY a.id; + +SELECT a.key1, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 +ORDER BY a.id; + +-- key2 EXPLAIN -SELECT * +SELECT a.key2, a.id FROM srcbucket_big a -JOIN src_small b ON a.key = b.key +JOIN src_small b ON a.key2 = b.key2 ORDER BY a.id; -SELECT * +SELECT a.key2, a.id FROM srcbucket_big a -JOIN src_small b ON a.key = b.key +JOIN src_small b ON a.key2 = b.key2 ORDER BY a.id; --- Using a non-bucket column +-- Using a non-partition column EXPLAIN -SELECT * +SELECT a.value, a.id FROM srcbucket_big a JOIN src_small b ON a.value = b.value ORDER BY a.id; -SELECT * +SELECT a.value, a.id FROM srcbucket_big a JOIN src_small b ON a.value = b.value ORDER BY a.id; + +-- key1 & key2 +EXPLAIN +SELECT a.key1, a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 +ORDER BY a.id; + +SELECT a.key1, a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 +ORDER BY a.id; + +-- key1 & non-partition column +EXPLAIN +SELECT a.key1, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value +ORDER BY a.id; + +SELECT a.key1, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value +ORDER BY a.id; + +-- key1 & key2 & non-partition column +EXPLAIN +SELECT a.key1, a.key2, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value +ORDER BY a.id; + +SELECT a.key1, a.key2, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value +ORDER BY a.id; diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out index 6f6ca30e874..e7a677f0628 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out @@ -1,62 +1,93 @@ -PREHOOK: query: CREATE TABLE srcbucket_big(key int, value string, id int) -PARTITIONED BY SPEC(bucket(4, key)) STORED BY ICEBERG +PREHOOK: query: CREATE TABLE srcbucket_big(key1 int, key2 string, value string, id int) +PARTITIONED BY SPEC(bucket(4, key1), bucket(8, key2)) STORED BY ICEBERG PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@srcbucket_big -POSTHOOK: query: CREATE TABLE srcbucket_big(key int, value string, id int) -PARTITIONED BY SPEC(bucket(4, key)) STORED BY ICEBERG +POSTHOOK: query: CREATE TABLE srcbucket_big(key1 int, key2 string, value string, id int) +PARTITIONED BY SPEC(bucket(4, key1), bucket(8, key2)) STORED BY ICEBERG POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@srcbucket_big PREHOOK: query: INSERT INTO srcbucket_big VALUES -(101, 'val_101', 1), -(null, 'val_102', 2), -(103, 'val_103', 3), -(104, null, 4), -(105, 'val_105', 5), -(null, null, 6) +(101, '1001', 'val_101', 1), +(null, '1002', 'val_102', 2), +(103, null, 'val_103', 3), +(104, '1004', null, 4), +(105, '1005', 'val_105', 5), +(101, '1001', 'val_101', 6), +(null, '1002', 'val_102', 7), +(103, null, 'val_103', 8), +(104, '1004', null, 9), +(105, '1005', 'val_105', 10), +(101, '1001', 'val_101', 11), +(null, '1002', 'val_102', 12), +(103, null, 'val_103', 13), +(104, '1004', null, 14), +(105, '1005', 'val_105', 15), +(101, '1001', 'val_101', 16), +(null, '1002', 'val_102', 17), +(103, null, 'val_103', 18), +(104, '1004', null, 19), +(105, '1005', 'val_105', 20), +(null, null, null, 21) PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@srcbucket_big POSTHOOK: query: INSERT INTO srcbucket_big VALUES -(101, 'val_101', 1), -(null, 'val_102', 2), -(103, 'val_103', 3), -(104, null, 4), -(105, 'val_105', 5), -(null, null, 6) +(101, '1001', 'val_101', 1), +(null, '1002', 'val_102', 2), +(103, null, 'val_103', 3), +(104, '1004', null, 4), +(105, '1005', 'val_105', 5), +(101, '1001', 'val_101', 6), +(null, '1002', 'val_102', 7), +(103, null, 'val_103', 8), +(104, '1004', null, 9), +(105, '1005', 'val_105', 10), +(101, '1001', 'val_101', 11), +(null, '1002', 'val_102', 12), +(103, null, 'val_103', 13), +(104, '1004', null, 14), +(105, '1005', 'val_105', 15), +(101, '1001', 'val_101', 16), +(null, '1002', 'val_102', 17), +(103, null, 'val_103', 18), +(104, '1004', null, 19), +(105, '1005', 'val_105', 20), +(null, null, null, 21) POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@srcbucket_big -PREHOOK: query: CREATE TABLE src_small(key int, value string) +PREHOOK: query: CREATE TABLE src_small(key1 int, key2 string, value string) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@src_small -POSTHOOK: query: CREATE TABLE src_small(key int, value string) +POSTHOOK: query: CREATE TABLE src_small(key1 int, key2 string, value string) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@src_small PREHOOK: query: INSERT INTO src_small VALUES -(101, 'val_101'), -(null, 'val_102'), -(103, 'val_103'), -(104, null), -(105, 'val_105'), -(null, null) +(101, '1001', 'val_101'), +(null, '1002', 'val_102'), +(103, null, 'val_103'), +(104, '1004', null), +(105, '1005', 'val_105'), +(null, null, null) PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@src_small POSTHOOK: query: INSERT INTO src_small VALUES -(101, 'val_101'), -(null, 'val_102'), -(103, 'val_103'), -(104, null), -(105, 'val_105'), -(null, null) +(101, '1001', 'val_101'), +(null, '1002', 'val_102'), +(103, null, 'val_103'), +(104, '1004', null), +(105, '1005', 'val_105'), +(null, null, null) POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@src_small -POSTHOOK: Lineage: src_small.key SCRIPT [] +POSTHOOK: Lineage: src_small.key1 SCRIPT [] +POSTHOOK: Lineage: src_small.key2 SCRIPT [] POSTHOOK: Lineage: src_small.value SCRIPT [] PREHOOK: query: SELECT * FROM srcbucket_big ORDER BY id PREHOOK: type: QUERY @@ -66,25 +97,40 @@ POSTHOOK: query: SELECT * FROM srcbucket_big ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@srcbucket_big #### A masked pattern was here #### -101 val_101 1 -NULL val_102 2 -103 val_103 3 -104 NULL 4 -105 val_105 5 -NULL NULL 6 +101 1001 val_101 1 +NULL 1002 val_102 2 +103 NULL val_103 3 +104 1004 NULL 4 +105 1005 val_105 5 +101 1001 val_101 6 +NULL 1002 val_102 7 +103 NULL val_103 8 +104 1004 NULL 9 +105 1005 val_105 10 +101 1001 val_101 11 +NULL 1002 val_102 12 +103 NULL val_103 13 +104 1004 NULL 14 +105 1005 val_105 15 +101 1001 val_101 16 +NULL 1002 val_102 17 +103 NULL val_103 18 +104 1004 NULL 19 +105 1005 val_105 20 +NULL NULL NULL 21 PREHOOK: query: EXPLAIN -SELECT * +SELECT a.key1, a.id FROM srcbucket_big a -JOIN src_small b ON a.key = b.key +JOIN src_small b ON a.key1 = b.key1 ORDER BY a.id PREHOOK: type: QUERY PREHOOK: Input: default@src_small PREHOOK: Input: default@srcbucket_big #### A masked pattern was here #### POSTHOOK: query: EXPLAIN -SELECT * +SELECT a.key1, a.id FROM srcbucket_big a -JOIN src_small b ON a.key = b.key +JOIN src_small b ON a.key1 = b.key1 ORDER BY a.id POSTHOOK: type: QUERY POSTHOOK: Input: default@src_small @@ -93,59 +139,155 @@ POSTHOOK: Input: default@srcbucket_big Plan optimized by CBO. Vertex dependency in root stage -Map 2 <- Map 1 (BROADCAST_EDGE) -Reducer 3 <- Map 2 (SIMPLE_EDGE) +Map 1 <- Map 3 (CUSTOM_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 - Reducer 3 vectorized, llap + Reducer 2 vectorized, llap File Output Operator [FS_37] - Select Operator [SEL_36] (rows=4 width=192) - Output:["_col0","_col1","_col2","_col3","_col4"] - <-Map 2 [SIMPLE_EDGE] vectorized, llap + Select Operator [SEL_36] (rows=16 width=8) + Output:["_col0","_col1"] + <-Map 1 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_35] - Map Join Operator [MAPJOIN_34] (rows=4 width=192) - Conds:RS_31._col0=SEL_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"] - <-Map 1 [BROADCAST_EDGE] vectorized, llap - BROADCAST [RS_31] + Map Join Operator [MAPJOIN_34] (rows=16 width=8) + BucketMapJoin:true,Conds:SEL_33._col0=RS_31._col0(Inner),Output:["_col0","_col1"] + <-Map 3 [CUSTOM_EDGE] vectorized, llap + MULTICAST [RS_31] PartitionCols:_col0 - Select Operator [SEL_30] (rows=4 width=99) - Output:["_col0","_col1","_col2"] - Filter Operator [FIL_29] (rows=4 width=99) - predicate:key is not null - TableScan [TS_0] (rows=4 width=99) - default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value","id"] - <-Select Operator [SEL_33] (rows=4 width=93) + Select Operator [SEL_30] (rows=4 width=4) + Output:["_col0"] + Filter Operator [FIL_29] (rows=4 width=4) + predicate:key1 is not null + TableScan [TS_3] (rows=6 width=3) + default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1"] + <-Select Operator [SEL_33] (rows=16 width=8) + Output:["_col0","_col1"] + Filter Operator [FIL_32] (rows=16 width=8) + predicate:key1 is not null + TableScan [TS_0] (rows=16 width=8) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","id"] + +PREHOOK: query: SELECT a.key1, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key1, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +101 1 +103 3 +104 4 +105 5 +101 6 +103 8 +104 9 +105 10 +101 11 +103 13 +104 14 +105 15 +101 16 +103 18 +104 19 +105 20 +PREHOOK: query: EXPLAIN +SELECT a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key2 = b.key2 +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key2 = b.key2 +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Map 3 (CUSTOM_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_37] + Select Operator [SEL_36] (rows=16 width=92) + Output:["_col0","_col1"] + <-Map 1 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_35] + Map Join Operator [MAPJOIN_34] (rows=16 width=92) + BucketMapJoin:true,Conds:SEL_33._col0=RS_31._col0(Inner),Output:["_col0","_col1"] + <-Map 3 [CUSTOM_EDGE] vectorized, llap + MULTICAST [RS_31] + PartitionCols:_col0 + Select Operator [SEL_30] (rows=4 width=87) + Output:["_col0"] + Filter Operator [FIL_29] (rows=4 width=87) + predicate:key2 is not null + TableScan [TS_3] (rows=6 width=72) + default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key2"] + <-Select Operator [SEL_33] (rows=16 width=92) Output:["_col0","_col1"] - Filter Operator [FIL_32] (rows=4 width=93) - predicate:key is not null - TableScan [TS_3] (rows=6 width=77) - default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + Filter Operator [FIL_32] (rows=16 width=92) + predicate:key2 is not null + TableScan [TS_0] (rows=16 width=92) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:8,Grouping Partition Columns:["key2"],Output:["key2","id"] -PREHOOK: query: SELECT * +PREHOOK: query: SELECT a.key2, a.id FROM srcbucket_big a -JOIN src_small b ON a.key = b.key +JOIN src_small b ON a.key2 = b.key2 ORDER BY a.id PREHOOK: type: QUERY PREHOOK: Input: default@src_small PREHOOK: Input: default@srcbucket_big #### A masked pattern was here #### -POSTHOOK: query: SELECT * +POSTHOOK: query: SELECT a.key2, a.id FROM srcbucket_big a -JOIN src_small b ON a.key = b.key +JOIN src_small b ON a.key2 = b.key2 ORDER BY a.id POSTHOOK: type: QUERY POSTHOOK: Input: default@src_small POSTHOOK: Input: default@srcbucket_big #### A masked pattern was here #### -101 val_101 1 101 val_101 -103 val_103 3 103 val_103 -104 NULL 4 104 NULL -105 val_105 5 105 val_105 +1001 1 +1002 2 +1004 4 +1005 5 +1001 6 +1002 7 +1004 9 +1005 10 +1001 11 +1002 12 +1004 14 +1005 15 +1001 16 +1002 17 +1004 19 +1005 20 PREHOOK: query: EXPLAIN -SELECT * +SELECT a.value, a.id FROM srcbucket_big a JOIN src_small b ON a.value = b.value ORDER BY a.id @@ -154,7 +296,7 @@ PREHOOK: Input: default@src_small PREHOOK: Input: default@srcbucket_big #### A masked pattern was here #### POSTHOOK: query: EXPLAIN -SELECT * +SELECT a.value, a.id FROM srcbucket_big a JOIN src_small b ON a.value = b.value ORDER BY a.id @@ -174,29 +316,29 @@ Stage-0 Stage-1 Reducer 2 vectorized, llap File Output Operator [FS_37] - Select Operator [SEL_36] (rows=4 width=192) - Output:["_col0","_col1","_col2","_col3","_col4"] + Select Operator [SEL_36] (rows=16 width=77) + Output:["_col0","_col1"] <-Map 1 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_35] - Map Join Operator [MAPJOIN_34] (rows=4 width=192) - Conds:SEL_33._col1=RS_31._col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4"] + Map Join Operator [MAPJOIN_34] (rows=16 width=77) + Conds:SEL_33._col0=RS_31._col0(Inner),Output:["_col0","_col1"] <-Map 3 [BROADCAST_EDGE] vectorized, llap BROADCAST [RS_31] - PartitionCols:_col1 - Select Operator [SEL_30] (rows=4 width=93) - Output:["_col0","_col1"] - Filter Operator [FIL_29] (rows=4 width=93) + PartitionCols:_col0 + Select Operator [SEL_30] (rows=4 width=89) + Output:["_col0"] + Filter Operator [FIL_29] (rows=4 width=89) predicate:value is not null - TableScan [TS_3] (rows=6 width=77) - default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] - <-Select Operator [SEL_33] (rows=4 width=99) - Output:["_col0","_col1","_col2"] - Filter Operator [FIL_32] (rows=4 width=99) + TableScan [TS_3] (rows=6 width=74) + default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["value"] + <-Select Operator [SEL_33] (rows=16 width=77) + Output:["_col0","_col1"] + Filter Operator [FIL_32] (rows=16 width=77) predicate:value is not null - TableScan [TS_0] (rows=6 width=83) - default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value","id"] + TableScan [TS_0] (rows=21 width=77) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["value","id"] -PREHOOK: query: SELECT * +PREHOOK: query: SELECT a.value, a.id FROM srcbucket_big a JOIN src_small b ON a.value = b.value ORDER BY a.id @@ -204,7 +346,7 @@ PREHOOK: type: QUERY PREHOOK: Input: default@src_small PREHOOK: Input: default@srcbucket_big #### A masked pattern was here #### -POSTHOOK: query: SELECT * +POSTHOOK: query: SELECT a.value, a.id FROM srcbucket_big a JOIN src_small b ON a.value = b.value ORDER BY a.id @@ -212,7 +354,255 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@src_small POSTHOOK: Input: default@srcbucket_big #### A masked pattern was here #### -101 val_101 1 101 val_101 -NULL val_102 2 NULL val_102 -103 val_103 3 103 val_103 -105 val_105 5 105 val_105 +val_101 1 +val_102 2 +val_103 3 +val_105 5 +val_101 6 +val_102 7 +val_103 8 +val_105 10 +val_101 11 +val_102 12 +val_103 13 +val_105 15 +val_101 16 +val_102 17 +val_103 18 +val_105 20 +PREHOOK: query: EXPLAIN +SELECT a.key1, a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a.key1, a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Map 3 (CUSTOM_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_57] + Select Operator [SEL_56] (rows=12 width=96) + Output:["_col0","_col1","_col2"] + <-Map 1 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_55] + Map Join Operator [MAPJOIN_54] (rows=12 width=96) + BucketMapJoin:true,Conds:SEL_53._col0, _col1=RS_51._col0, _col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 3 [CUSTOM_EDGE] vectorized, llap + MULTICAST [RS_51] + PartitionCols:_col0, _col1 + Select Operator [SEL_50] (rows=3 width=91) + Output:["_col0","_col1"] + Filter Operator [FIL_49] (rows=3 width=91) + predicate:(key1 is not null and key2 is not null) + TableScan [TS_3] (rows=6 width=75) + default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2"] + <-Select Operator [SEL_53] (rows=12 width=96) + Output:["_col0","_col1","_col2"] + Filter Operator [FIL_52] (rows=12 width=96) + predicate:(key1 is not null and key2 is not null) + TableScan [TS_0] (rows=12 width=96) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:32,Grouping Partition Columns:["key1","key2"],Output:["key1","key2","id"] + +PREHOOK: query: SELECT a.key1, a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key1, a.key2, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +101 1001 1 +104 1004 4 +105 1005 5 +101 1001 6 +104 1004 9 +105 1005 10 +101 1001 11 +104 1004 14 +105 1005 15 +101 1001 16 +104 1004 19 +105 1005 20 +PREHOOK: query: EXPLAIN +SELECT a.key1, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a.key1, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Map 3 (CUSTOM_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_57] + Select Operator [SEL_56] (rows=9 width=78) + Output:["_col0","_col1","_col2"] + <-Map 1 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_55] + Map Join Operator [MAPJOIN_54] (rows=9 width=78) + BucketMapJoin:true,Conds:SEL_53._col0, _col1=RS_51._col0, _col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 3 [CUSTOM_EDGE] vectorized, llap + MULTICAST [RS_51] + PartitionCols:_col0 + Select Operator [SEL_50] (rows=3 width=93) + Output:["_col0","_col1"] + Filter Operator [FIL_49] (rows=3 width=93) + predicate:(key1 is not null and value is not null) + TableScan [TS_3] (rows=6 width=77) + default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","value"] + <-Select Operator [SEL_53] (rows=12 width=83) + Output:["_col0","_col1","_col2"] + Filter Operator [FIL_52] (rows=12 width=83) + predicate:(value is not null and key1 is not null) + TableScan [TS_0] (rows=16 width=81) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","value","id"] + +PREHOOK: query: SELECT a.key1, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key1, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +101 val_101 1 +103 val_103 3 +105 val_105 5 +101 val_101 6 +103 val_103 8 +105 val_105 10 +101 val_101 11 +103 val_103 13 +105 val_105 15 +101 val_101 16 +103 val_103 18 +105 val_105 20 +PREHOOK: query: EXPLAIN +SELECT a.key1, a.key2, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a.key1, a.key2, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Map 3 (CUSTOM_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_67] + Select Operator [SEL_66] (rows=5 width=150) + Output:["_col0","_col1","_col2","_col3"] + <-Map 1 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_65] + Map Join Operator [MAPJOIN_64] (rows=5 width=150) + BucketMapJoin:true,Conds:SEL_63._col0, _col1, _col2=RS_61._col0, _col1, _col2(Inner),Output:["_col0","_col1","_col2","_col3"] + <-Map 3 [CUSTOM_EDGE] vectorized, llap + MULTICAST [RS_61] + PartitionCols:_col0, _col1 + Select Operator [SEL_60] (rows=2 width=180) + Output:["_col0","_col1","_col2"] + Filter Operator [FIL_59] (rows=2 width=180) + predicate:(key1 is not null and key2 is not null and value is not null) + TableScan [TS_3] (rows=6 width=150) + default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2","value"] + <-Select Operator [SEL_63] (rows=8 width=164) + Output:["_col0","_col1","_col2","_col3"] + Filter Operator [FIL_62] (rows=8 width=164) + predicate:(value is not null and key1 is not null and key2 is not null) + TableScan [TS_0] (rows=12 width=164) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:32,Grouping Partition Columns:["key1","key2"],Output:["key1","key2","value","id"] + +PREHOOK: query: SELECT a.key1, a.key2, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value +ORDER BY a.id +PREHOOK: type: QUERY +PREHOOK: Input: default@src_small +PREHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key1, a.key2, a.value, a.id +FROM srcbucket_big a +JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value +ORDER BY a.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src_small +POSTHOOK: Input: default@srcbucket_big +#### A masked pattern was here #### +101 1001 val_101 1 +105 1005 val_105 5 +101 1001 val_101 6 +105 1005 val_105 10 +101 1001 val_101 11 +105 1005 val_105 15 +101 1001 val_101 16 +105 1005 val_105 20 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java index 7d91d7ea216..bf149397efe 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java @@ -381,45 +381,82 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, public static class SelectRule implements SemanticNodeProcessor { // For bucket columns - // If all the columns match to the parent, put them in the bucket cols + // If the projected columns are compatible with the bucketing requirement, put them in the bucket cols // else, add empty list. + private void putConvertedColNamesForBucket( + List<List<String>> parentColNamesList, List<CustomBucketFunction> parentBucketFunctions, SelectOperator selOp, + List<List<String>> newBucketColNamesList, List<CustomBucketFunction> newBucketFunctions) { + Preconditions.checkState(parentColNamesList.size() == parentBucketFunctions.size()); + for (int i = 0; i < parentColNamesList.size(); i++) { + List<String> colNames = parentColNamesList.get(i); + + List<String> newBucketColNames = new ArrayList<>(); + boolean[] retainedColumns = new boolean[colNames.size()]; + boolean allFound = true; + for (int j = 0; j < colNames.size(); j++) { + final String colName = colNames.get(j); + Optional<String> newColName = resolveNewColName(colName, selOp); + if (newColName.isPresent()) { + retainedColumns[j] = true; + newBucketColNames.add(newColName.get()); + } else { + retainedColumns[j] = false; + allFound = false; + } + } + + CustomBucketFunction bucketFunction = parentBucketFunctions.get(i); + if (allFound) { + newBucketColNamesList.add(newBucketColNames); + newBucketFunctions.add(bucketFunction); + } else if (bucketFunction == null) { + // Hive's native bucketing is effective only when all the bucketing columns are used + newBucketColNamesList.add(new ArrayList<>()); + newBucketFunctions.add(null); + } else { + Optional<CustomBucketFunction> newBucketFunction = bucketFunction.select(retainedColumns); + if (newBucketFunction.isPresent()) { + newBucketColNamesList.add(newBucketColNames); + newBucketFunctions.add(newBucketFunction.get()); + } else { + newBucketColNamesList.add(new ArrayList<>()); + newBucketFunctions.add(null); + } + } + } + } + // For sort columns // Keep the subset of all the columns as long as order is maintained. - public List<List<String>> getConvertedColNames( - List<List<String>> parentColNames, SelectOperator selOp, boolean processSortCols) { + private List<List<String>> getConvertedColNamesForSort(List<List<String>> parentColNames, SelectOperator selOp) { List<List<String>> listBucketCols = new ArrayList<>(); for (List<String> colNames : parentColNames) { List<String> bucketColNames = new ArrayList<>(); - boolean found = false; for (String colName : colNames) { - // Reset found - found = false; - for (Entry<String, ExprNodeDesc> entry : selOp.getColumnExprMap().entrySet()) { - if ((entry.getValue() instanceof ExprNodeColumnDesc) && - (((ExprNodeColumnDesc) (entry.getValue())).getColumn().equals(colName))) { - bucketColNames.add(entry.getKey()); - found = true; - break; - } - } - if (!found) { + Optional<String> newColName = resolveNewColName(colName, selOp); + if (newColName.isPresent()) { + bucketColNames.add(newColName.get()); + } else { // Bail out on first missed column. break; } } - if (!processSortCols && !found) { - // While processing bucket columns, atleast one bucket column - // missed. This results in a different bucketing scheme. - // Add empty list - listBucketCols.add(new ArrayList<>()); - } else { - listBucketCols.add(bucketColNames); - } + listBucketCols.add(bucketColNames); } return listBucketCols; } + private Optional<String> resolveNewColName(String parentColName, SelectOperator selOp) { + for (Entry<String, ExprNodeDesc> entry : selOp.getColumnExprMap().entrySet()) { + if ((entry.getValue() instanceof ExprNodeColumnDesc) && + (((ExprNodeColumnDesc) (entry.getValue())).getColumn().equals(parentColName))) { + return Optional.of(entry.getKey()); + } + } + return Optional.empty(); + } + @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { @@ -428,28 +465,18 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, List<List<String>> parentBucketColNames = parentOpTraits.getBucketColNames(); List<List<String>> listBucketCols = null; + List<CustomBucketFunction> bucketFunctions = null; List<List<String>> listSortCols = null; if (selOp.getColumnExprMap() != null) { if (parentBucketColNames != null) { - listBucketCols = getConvertedColNames(parentBucketColNames, selOp, false); + listBucketCols = new ArrayList<>(); + bucketFunctions = new ArrayList<>(); + putConvertedColNamesForBucket(parentBucketColNames, parentOpTraits.getCustomBucketFunctions(), selOp, + listBucketCols, bucketFunctions); } List<List<String>> parentSortColNames = parentOpTraits.getSortCols(); if (parentSortColNames != null) { - listSortCols = getConvertedColNames(parentSortColNames, selOp, true); - } - } - - List<CustomBucketFunction> bucketFunctions = null; - if (listBucketCols != null) { - Preconditions.checkState(parentBucketColNames.size() == listBucketCols.size()); - bucketFunctions = new ArrayList<>(); - for (int i = 0; i < listBucketCols.size(); i++) { - if (listBucketCols.get(i).isEmpty()) { - bucketFunctions.add(null); - } else { - Preconditions.checkState(listBucketCols.get(i).size() == parentBucketColNames.get(i).size()); - bucketFunctions.add(parentOpTraits.getCustomBucketFunctions().get(i)); - } + listSortCols = getConvertedColNamesForSort(parentSortColNames, selOp); } }