This is an automated email from the ASF dual-hosted git repository.
sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new 642c72be666 HIVE-27251: Backport HIVE-22121: Turning on
hive.tez.bucket.pruning produce wrong results (Vineet Garg, reviewed by Gopal V)
642c72be666 is described below
commit 642c72be6663da860f28d3c50ded5d9ce25a01f1
Author: Aman Raj <[email protected]>
AuthorDate: Wed Jun 21 14:10:34 2023 +0530
HIVE-27251: Backport HIVE-22121: Turning on hive.tez.bucket.pruning produce
wrong results (Vineet Garg, reviewed by Gopal V)
Signed-off-by: Sankar Hariappan <[email protected]>
Closes #4439
---
.../ql/optimizer/FixedBucketPruningOptimizer.java | 4 +
.../clientpositive/tez_fixed_bucket_pruning.q | 14 +
.../llap/tez_fixed_bucket_pruning.q.out | 345 +++++++++++++++++++++
3 files changed, 363 insertions(+)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
index 334b8e9babc..fff0904d844 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
@@ -125,6 +125,7 @@ public class FixedBucketPruningOptimizer extends Transform {
for (StructField fs : tbl.getFields()) {
if(fs.getFieldName().equals(bucketCol)) {
bucketField = fs;
+ break;
}
}
Preconditions.checkArgument(bucketField != null);
@@ -200,6 +201,9 @@ public class FixedBucketPruningOptimizer extends Transform {
return;
}
}
+ } else if (expr.getOperator() == Operator.NOT) {
+ // TODO: think we can handle NOT IS_NULL?
+ return;
}
// invariant: bucket-col IN literals of type bucketField
BitSet bs = new BitSet(numBuckets);
diff --git a/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
b/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
index cbc39977da1..21d5907d7c3 100644
--- a/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
+++ b/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
@@ -221,3 +221,17 @@ where DW.PROJECT_OBJECT_ID =7147200
order by DW.PROJECT_OBJECT_ID, PLAN_KEY, PROJECT_KEY
limit 5;
+CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3` string)
+ CLUSTERED BY (col_1) INTO 4 BUCKETS;
+
+insert into test_table values(1, 'one', 'ONE'), (2, 'two', 'TWO'),
(3,'three','THREE'),(4,'four','FOUR');
+
+select * from test_table;
+
+explain extended select col_1, col_2, col_3 from test_table where col_1 <> 2
order by col_2;
+select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2;
+
+explain extended select col_1, col_2, col_3 from test_table where col_1 = 2
order by col_2;
+select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2;
+
+drop table `test_table`;
\ No newline at end of file
diff --git
a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
index 26f741e2fe8..9241bea347c 100644
--- a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
+++ b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
@@ -1459,3 +1459,348 @@ POSTHOOK: Input: default@l3_monthly_dw_dimplan
7147200 195775 27114
7147200 234349 27114
7147200 350519 27114
+PREHOOK: query: CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3`
string)
+ CLUSTERED BY (col_1) INTO 4 BUCKETS
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_table
+POSTHOOK: query: CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3`
string)
+ CLUSTERED BY (col_1) INTO 4 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_table
+PREHOOK: query: insert into test_table values(1, 'one', 'ONE'), (2, 'two',
'TWO'), (3,'three','THREE'),(4,'four','FOUR')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_table
+POSTHOOK: query: insert into test_table values(1, 'one', 'ONE'), (2, 'two',
'TWO'), (3,'three','THREE'),(4,'four','FOUR')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_table
+POSTHOOK: Lineage: test_table.col_1 SCRIPT []
+POSTHOOK: Lineage: test_table.col_2 SCRIPT []
+POSTHOOK: Lineage: test_table.col_3 SCRIPT []
+PREHOOK: query: select * from test_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+2 two TWO
+4 four FOUR
+1 one ONE
+3 three THREE
+PREHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 <> 2 order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 <> 2 order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+OPTIMIZED SQL: SELECT `col_1`, `col_2`, `col_3`
+FROM `default`.`test_table`
+WHERE `col_1` <> 2
+ORDER BY `col_2`
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: test_table
+ filterExpr: (col_1 <> 2) (type: boolean)
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: (col_1 <> 2) (type: boolean)
+ Statistics: Num rows: 4 Data size: 720 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col_1 (type: int), col_2 (type: string),
col_3 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 4 Data size: 720 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: string)
+ null sort order: z
+ sort order: +
+ Statistics: Num rows: 4 Data size: 720 Basic stats:
COMPLETE Column stats: COMPLETE
+ tag: -1
+ value expressions: _col0 (type: int), _col2 (type:
string)
+ auto parallelism: false
+ Execution mode: vectorized, llap
+ LLAP IO: no inputs
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: test_table
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.test_table
+ name: default.test_table
+ Truncated Path -> Alias:
+ /test_table [test_table]
+ Reducer 2
+ Execution mode: vectorized, llap
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col0 (type: int), KEY.reducesinkkey0
(type: string), VALUE._col1 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+#### A masked pattern was here ####
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:string
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels
true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col_1, col_2, col_3 from test_table where col_1 <> 2
order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: select col_1, col_2, col_3 from test_table where col_1 <> 2
order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+4 four FOUR
+1 one ONE
+3 three THREE
+PREHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 = 2 order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 = 2 order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+OPTIMIZED SQL: SELECT CAST(2 AS INTEGER) AS `col_1`, `col_2`, `col_3`
+FROM (SELECT `col_2`, `col_3`
+FROM `default`.`test_table`
+WHERE `col_1` = 2
+ORDER BY `col_2`) AS `t1`
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: test_table
+ filterExpr: (col_1 = 2) (type: boolean)
+ buckets included: [0,] of 4
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: (col_1 = 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 180 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col_2 (type: string), col_3 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 176 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Statistics: Num rows: 1 Data size: 176 Basic stats:
COMPLETE Column stats: COMPLETE
+ tag: -1
+ value expressions: _col1 (type: string)
+ auto parallelism: false
+ Execution mode: vectorized, llap
+ LLAP IO: no inputs
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: test_table
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.test_table
+ name: default.test_table
+ Truncated Path -> Alias:
+ /test_table [test_table]
+ Reducer 2
+ Execution mode: vectorized, llap
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: 2 (type: int), KEY.reducesinkkey0 (type: string),
VALUE._col0 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE
Column stats: COMPLETE
+#### A masked pattern was here ####
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:string
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels
true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col_1, col_2, col_3 from test_table where col_1 = 2
order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: select col_1, col_2, col_3 from test_table where col_1 = 2
order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+2 two TWO
+PREHOOK: query: drop table `test_table`
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@test_table
+PREHOOK: Output: default@test_table
+POSTHOOK: query: drop table `test_table`
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@test_table
+POSTHOOK: Output: default@test_table