This is an automated email from the ASF dual-hosted git repository.
sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new 642c72be666 HIVE-27251: Backport HIVE-22121: Turning on
hive.tez.bucket.pruning produce wrong results (Vineet Garg, reviewed by Gopal V)
642c72be666 is described below
commit 642c72be6663da860f28d3c50ded5d9ce25a01f1
Author: Aman Raj <[email protected]>
AuthorDate: Wed Jun 21 14:10:34 2023 +0530
HIVE-27251: Backport HIVE-22121: Turning on hive.tez.bucket.pruning produce
wrong results (Vineet Garg, reviewed by Gopal V)
Signed-off-by: Sankar Hariappan <[email protected]>
Closes #4439
---
.../ql/optimizer/FixedBucketPruningOptimizer.java | 4 +
.../clientpositive/tez_fixed_bucket_pruning.q | 14 +
.../llap/tez_fixed_bucket_pruning.q.out | 345 +++++++++++++++++++++
3 files changed, 363 insertions(+)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
index 334b8e9babc..fff0904d844 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
@@ -125,6 +125,7 @@ public class FixedBucketPruningOptimizer extends Transform {
for (StructField fs : tbl.getFields()) {
if(fs.getFieldName().equals(bucketCol)) {
bucketField = fs;
+ break;
}
}
Preconditions.checkArgument(bucketField != null);
@@ -200,6 +201,9 @@ public class FixedBucketPruningOptimizer extends Transform {
return;
}
}
+ } else if (expr.getOperator() == Operator.NOT) {
+ // TODO: think we can handle NOT IS_NULL?
+ return;
}
// invariant: bucket-col IN literals of type bucketField
BitSet bs = new BitSet(numBuckets);
diff --git a/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
b/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
index cbc39977da1..21d5907d7c3 100644
--- a/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
+++ b/ql/src/test/queries/clientpositive/tez_fixed_bucket_pruning.q
@@ -221,3 +221,17 @@ where DW.PROJECT_OBJECT_ID =7147200
order by DW.PROJECT_OBJECT_ID, PLAN_KEY, PROJECT_KEY
limit 5;
+CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3` string)
+ CLUSTERED BY (col_1) INTO 4 BUCKETS;
+
+insert into test_table values(1, 'one', 'ONE'), (2, 'two', 'TWO'),
(3,'three','THREE'),(4,'four','FOUR');
+
+select * from test_table;
+
+explain extended select col_1, col_2, col_3 from test_table where col_1 <> 2
order by col_2;
+select col_1, col_2, col_3 from test_table where col_1 <> 2 order by col_2;
+
+explain extended select col_1, col_2, col_3 from test_table where col_1 = 2
order by col_2;
+select col_1, col_2, col_3 from test_table where col_1 = 2 order by col_2;
+
+drop table `test_table`;
\ No newline at end of file
diff --git
a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
index 26f741e2fe8..9241bea347c 100644
--- a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
+++ b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
@@ -1459,3 +1459,348 @@ POSTHOOK: Input: default@l3_monthly_dw_dimplan
7147200 195775 27114
7147200 234349 27114
7147200 350519 27114
+PREHOOK: query: CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3`
string)
+ CLUSTERED BY (col_1) INTO 4 BUCKETS
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_table
+POSTHOOK: query: CREATE TABLE `test_table`( `col_1` int,`col_2` string,`col_3`
string)
+ CLUSTERED BY (col_1) INTO 4 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_table
+PREHOOK: query: insert into test_table values(1, 'one', 'ONE'), (2, 'two',
'TWO'), (3,'three','THREE'),(4,'four','FOUR')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_table
+POSTHOOK: query: insert into test_table values(1, 'one', 'ONE'), (2, 'two',
'TWO'), (3,'three','THREE'),(4,'four','FOUR')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_table
+POSTHOOK: Lineage: test_table.col_1 SCRIPT []
+POSTHOOK: Lineage: test_table.col_2 SCRIPT []
+POSTHOOK: Lineage: test_table.col_3 SCRIPT []
+PREHOOK: query: select * from test_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+2 two TWO
+4 four FOUR
+1 one ONE
+3 three THREE
+PREHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 <> 2 order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 <> 2 order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+OPTIMIZED SQL: SELECT `col_1`, `col_2`, `col_3`
+FROM `default`.`test_table`
+WHERE `col_1` <> 2
+ORDER BY `col_2`
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: test_table
+ filterExpr: (col_1 <> 2) (type: boolean)
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: (col_1 <> 2) (type: boolean)
+ Statistics: Num rows: 4 Data size: 720 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col_1 (type: int), col_2 (type: string),
col_3 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 4 Data size: 720 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: string)
+ null sort order: z
+ sort order: +
+ Statistics: Num rows: 4 Data size: 720 Basic stats:
COMPLETE Column stats: COMPLETE
+ tag: -1
+ value expressions: _col0 (type: int), _col2 (type:
string)
+ auto parallelism: false
+ Execution mode: vectorized, llap
+ LLAP IO: no inputs
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: test_table
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.test_table
+ name: default.test_table
+ Truncated Path -> Alias:
+ /test_table [test_table]
+ Reducer 2
+ Execution mode: vectorized, llap
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col0 (type: int), KEY.reducesinkkey0
(type: string), VALUE._col1 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+#### A masked pattern was here ####
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:string
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels
true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col_1, col_2, col_3 from test_table where col_1 <> 2
order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: select col_1, col_2, col_3 from test_table where col_1 <> 2
order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+4 four FOUR
+1 one ONE
+3 three THREE
+PREHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 = 2 order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: explain extended select col_1, col_2, col_3 from test_table
where col_1 = 2 order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+OPTIMIZED SQL: SELECT CAST(2 AS INTEGER) AS `col_1`, `col_2`, `col_3`
+FROM (SELECT `col_2`, `col_3`
+FROM `default`.`test_table`
+WHERE `col_1` = 2
+ORDER BY `col_2`) AS `t1`
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: test_table
+ filterExpr: (col_1 = 2) (type: boolean)
+ buckets included: [0,] of 4
+ Statistics: Num rows: 4 Data size: 720 Basic stats: COMPLETE
Column stats: COMPLETE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: (col_1 = 2) (type: boolean)
+ Statistics: Num rows: 1 Data size: 180 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col_2 (type: string), col_3 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 176 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Statistics: Num rows: 1 Data size: 176 Basic stats:
COMPLETE Column stats: COMPLETE
+ tag: -1
+ value expressions: _col1 (type: string)
+ auto parallelism: false
+ Execution mode: vectorized, llap
+ LLAP IO: no inputs
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: test_table
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE
{"BASIC_STATS":"true","COLUMN_STATS":{"col_1":"true","col_2":"true","col_3":"true"}}
+ bucket_count 4
+ bucket_field_name col_1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns col_1,col_2,col_3
+ columns.comments
+ columns.types int:string:string
+#### A masked pattern was here ####
+ name default.test_table
+ numFiles 3
+ numRows 4
+ rawDataSize 42
+ serialization.ddl struct test_table { i32 col_1, string
col_2, string col_3}
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 46
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.test_table
+ name: default.test_table
+ Truncated Path -> Alias:
+ /test_table [test_table]
+ Reducer 2
+ Execution mode: vectorized, llap
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: 2 (type: int), KEY.reducesinkkey0 (type: string),
VALUE._col0 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 1 Data size: 180 Basic stats: COMPLETE
Column stats: COMPLETE
+#### A masked pattern was here ####
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:string
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels
true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col_1, col_2, col_3 from test_table where col_1 = 2
order by col_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_table
+#### A masked pattern was here ####
+POSTHOOK: query: select col_1, col_2, col_3 from test_table where col_1 = 2
order by col_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_table
+#### A masked pattern was here ####
+2 two TWO
+PREHOOK: query: drop table `test_table`
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@test_table
+PREHOOK: Output: default@test_table
+POSTHOOK: query: drop table `test_table`
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@test_table
+POSTHOOK: Output: default@test_table