This is an automated email from the ASF dual-hosted git repository. vgarg pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new bea7ffd HIVE-22121: Turning on hive.tez.bucket.pruning produce wrong results (Vineet Garg, reviewed by Gopal V) bea7ffd is described below commit bea7ffd9728d21f5424ca8bcd9b5c7bc0d0acb2f Author: Vineet Garg <vg...@apache.org> AuthorDate: Mon Aug 26 15:46:54 2019 -0700 HIVE-22121: Turning on hive.tez.bucket.pruning produce wrong results (Vineet Garg, reviewed by Gopal V) --- .../ql/optimizer/FixedBucketPruningOptimizer.java | 4 + .../clientpositive/tez_fixed_bucket_pruning.q | 14 + .../llap/tez_fixed_bucket_pruning.q.out | 345 +++++++++++++++++++++ 3 files changed, 363 insertions(+) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java index 3a40b31..c5553fb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java @@ -120,6 +120,7 @@ public class FixedBucketPruningOptimizer extends Transform { for (StructField fs : tbl.getFields()) { if(fs.getFieldName().equals(bucketCol)) { bucketField = fs; + break; } } Preconditions.checkArgument(bucketField != null); @@ -195,6 +196,9 @@ public class FixedBucketPruningOptimizer extends Transform { return; } } + } else if (expr.getOperator() == Operator.NOT) { + // TODO: think we can handle NOT IS_NULL? + return; } // invariant: bucket-col IN literals of type bucketField BitSet bs = new BitSet(numBuckets); diff --git a/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q b/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q index cbc3997..21d5907 100644 --- a/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q +++ b/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q @@ -221,3 +221,17 @@ where DW.PROJECT_OBJECT_ID =7147200 order by DW.PROJECT_OBJECT_ID, PLAN_KEY, PROJECT_KEY limit 5; +CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3` string) + CLUSTERED BY (col_1) INTO 4 BUCKETS; + +insert into test_table values(1, 'one', 'ONE'), (2, 'two', 'TWO'), (3,'three','THREE'),(4,'four','FOUR'); + +select * from test_table; + +explain extended select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2; +select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2; + +explain extended select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2; +select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2; + +drop table `test_table`; \ No newline at end of file diff --git a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out index eaed60c..5367d33 100644 --- a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out +++ b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out @@ -1442,3 +1442,348 @@ POSTHOOK: Input: default@l3_monthly_dw_dimplan 7147200 195775 27114 7147200 234349 27114 7147200 350519 27114 +PREHOOK: query: CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3` string) + CLUSTERED BY (col_1) INTO 4 BUCKETS +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_table +POSTHOOK: query: CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3` string) + CLUSTERED BY (col_1) INTO 4 BUCKETS +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_table +PREHOOK: query: insert into test_table values(1, 'one', 'ONE'), (2, 'two', 'TWO'), (3,'three','THREE'),(4,'four','FOUR') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_table +POSTHOOK: query: insert into test_table values(1, 'one', 'ONE'), (2, 'two', 'TWO'), (3,'three','THREE'),(4,'four','FOUR') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_table +POSTHOOK: Lineage: test_table.col_1 SCRIPT [] +POSTHOOK: Lineage: test_table.col_2 SCRIPT [] +POSTHOOK: Lineage: test_table.col_3 SCRIPT [] +PREHOOK: query: select * from test_table +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select * from test_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +2 two TWO +1 one ONE +4 four FOUR +3 three THREE +PREHOOK: query: explain extended select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: explain extended select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +OPTIMIZED SQL: SELECT `col_1`, `col_2`, `col_3` +FROM `default`.`test_table` +WHERE `col_1` <> 2 +ORDER BY `col_2` +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: test_table + filterExpr: (col_1 <> 2) (type: boolean) + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: (col_1 <> 2) (type: boolean) + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: col_1 (type: int), col_2 (type: string), col_3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: string) + null sort order: z + sort order: + + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col0 (type: int), _col2 (type: string) + auto parallelism: false + Execution mode: vectorized, llap + LLAP IO: no inputs + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: test_table + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}} + bucket_count 4 + bucket_field_name col_1 + bucketing_version 2 + column.name.delimiter , + columns col_1,col_2,col_3 + columns.comments + columns.types int:string:string +#### A masked pattern was here #### + name default.test_table + numFiles 3 + numRows 4 + rawDataSize 42 + serialization.ddl struct test_table { i32 col_1, string col_2, string col_3} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 46 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}} + bucket_count 4 + bucket_field_name col_1 + bucketing_version 2 + column.name.delimiter , + columns col_1,col_2,col_3 + columns.comments + columns.types int:string:string +#### A masked pattern was here #### + name default.test_table + numFiles 3 + numRows 4 + rawDataSize 42 + serialization.ddl struct test_table { i32 col_1, string col_2, string col_3} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 46 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table + name: default.test_table + Truncated Path -> Alias: + /test_table [test_table] + Reducer 2 + Execution mode: vectorized, llap + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: string), VALUE._col1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +4 four FOUR +1 one ONE +3 three THREE +PREHOOK: query: explain extended select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: explain extended select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +OPTIMIZED SQL: SELECT CAST(2 AS INTEGER) AS `col_1`, `col_2`, `col_3` +FROM (SELECT `col_2`, `col_3` +FROM `default`.`test_table` +WHERE `col_1` = 2 +ORDER BY `col_2`) AS `t1` +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: test_table + filterExpr: (col_1 = 2) (type: boolean) + buckets included: [0,] of 4 + Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: (col_1 = 2) (type: boolean) + Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: col_2 (type: string), col_3 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col1 (type: string) + auto parallelism: false + Execution mode: vectorized, llap + LLAP IO: no inputs + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: test_table + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}} + bucket_count 4 + bucket_field_name col_1 + bucketing_version 2 + column.name.delimiter , + columns col_1,col_2,col_3 + columns.comments + columns.types int:string:string +#### A masked pattern was here #### + name default.test_table + numFiles 3 + numRows 4 + rawDataSize 42 + serialization.ddl struct test_table { i32 col_1, string col_2, string col_3} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 46 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}} + bucket_count 4 + bucket_field_name col_1 + bucketing_version 2 + column.name.delimiter , + columns col_1,col_2,col_3 + columns.comments + columns.types int:string:string +#### A masked pattern was here #### + name default.test_table + numFiles 3 + numRows 4 + rawDataSize 42 + serialization.ddl struct test_table { i32 col_1, string col_2, string col_3} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 46 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_table + name: default.test_table + Truncated Path -> Alias: + /test_table [test_table] + Reducer 2 + Execution mode: vectorized, llap + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: 2 (type: int), KEY.reducesinkkey0 (type: string), VALUE._col0 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test_table +#### A masked pattern was here #### +POSTHOOK: query: select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_table +#### A masked pattern was here #### +2 two TWO +PREHOOK: query: drop table `test_table` +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_table +PREHOOK: Output: default@test_table +POSTHOOK: query: drop table `test_table` +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_table +POSTHOOK: Output: default@test_table