[hive] branch master updated: HIVE-21778: CBO: "Struct is not null" gets evaluated as `nullable` always causing filter miss in the query (Vineet Garg, reviewed by Jesus Camacho Rodriguez)

vgarg Fri, 13 Mar 2020 09:14:51 -0700

This is an automated email from the ASF dual-hosted git repository.

vgarg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new 64a020b  HIVE-21778: CBO: "Struct is not null" gets evaluated as 
`nullable` always causing filter miss in the query (Vineet Garg, reviewed by 
Jesus Camacho Rodriguez)
64a020b is described below

commit 64a020b2392cdfd6d2230a99a010dbeafd31eaf5
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Fri Mar 13 09:13:22 2020 -0700

    HIVE-21778: CBO: "Struct is not null" gets evaluated as `nullable` always 
causing filter miss in the query (Vineet Garg, reviewed by Jesus Camacho 
Rodriguez)
---
 .../HiveProjectFilterPullUpConstantsRule.java      |   2 +-
 .../optimizer/calcite/translator/ASTBuilder.java   |   4 +-
 .../calcite/translator/TypeConverter.java          |   3 +-
 ql/src/test/queries/clientpositive/structin.q      |  18 +++
 .../results/clientpositive/intersect_all_rj.q.out  |  19 ++-
 .../results/clientpositive/llap/subquery_in.q.out  |   8 +-
 .../llap/vector_identity_reuse.q.out               |  42 +++----
 .../clientpositive/perf/spark/query93.q.out        |  18 +--
 .../clientpositive/perf/tez/cbo_query93.q.out      |  10 +-
 .../results/clientpositive/perf/tez/query93.q.out  |  18 +--
 ql/src/test/results/clientpositive/structin.q.out  | 127 +++++++++++++++++++++
 11 files changed, 208 insertions(+), 61 deletions(-)

diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectFilterPullUpConstantsRule.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectFilterPullUpConstantsRule.java
index 25227a3..efabe5d 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectFilterPullUpConstantsRule.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveProjectFilterPullUpConstantsRule.java
@@ -119,7 +119,7 @@ public class HiveProjectFilterPullUpConstantsRule extends 
RelOptRule {
         case IS_NULL:
           conditions.put(conjCall.operands.get(0).toString(),
                   relBuilder.getRexBuilder().makeNullLiteral(
-                          
conjCall.operands.get(0).getType().getSqlTypeName()));
+                          conjCall.operands.get(0).getType()));
       }
     }
 
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java
index 7328b72..f714f07 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java
@@ -243,6 +243,7 @@ public class ASTBuilder {
     case INTERVAL_SECOND:
     case INTERVAL_YEAR:
     case INTERVAL_YEAR_MONTH:
+    case ROW:
       if (literal.getValue() == null) {
         return ASTBuilder.construct(HiveParser.TOK_NULL, "TOK_NULL").node();
       }
@@ -364,8 +365,9 @@ public class ASTBuilder {
       type = HiveParser.TOK_NULL;
       break;
 
-    //binary type should not be seen.
+    //binary, ROW type should not be seen.
     case BINARY:
+    case ROW:
     default:
       throw new RuntimeException("Unsupported Type: " + sqlType);
     }
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/TypeConverter.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/TypeConverter.java
index edb3199..ed4a73e 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/TypeConverter.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/TypeConverter.java
@@ -160,7 +160,8 @@ public class TypeConverter {
       convertedType = convert((UnionTypeInfo) type, dtFactory);
       break;
     }
-    return convertedType;
+    // hive does not have concept of not nullable types
+    return dtFactory.createTypeWithNullability(convertedType, true);
   }
 
   public static RelDataType convert(PrimitiveTypeInfo type, RelDataTypeFactory 
dtFactory) {
diff --git a/ql/src/test/queries/clientpositive/structin.q 
b/ql/src/test/queries/clientpositive/structin.q
index 9621195..7d8a6e4 100644
--- a/ql/src/test/queries/clientpositive/structin.q
+++ b/ql/src/test/queries/clientpositive/structin.q
@@ -21,3 +21,21 @@ IN (
 struct('1234-1111-0074578664','3'),
 struct('1234-1111-0074578695',1)
 );
+
+CREATE TABLE test_struct
+(
+  f1 string,
+  demo_struct struct<f1:string, f2:string, f3:string>,
+  datestr string
+);
+
+insert into test_struct values('s1', named_struct('f1','1', 'f2','2', 
'f3','3'), '02-02-2020');
+insert into test_struct values('s2', named_struct('f1',cast(null as 
string),'f2', cast(null as string),'f3', cast(null as string)), '02-02-2020');
+insert into test_struct values('s4', named_struct('f1','100', 'f2','200', 
'f3','300'), '02-02-2020');
+
+explain select * from test_struct where datestr='02-02-2020' and demo_struct 
is not null
+ order by f1;
+select * from test_struct where datestr='02-02-2020' and demo_struct is not 
null
+ order by f1;
+
+DROP TABLE test_struct;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/intersect_all_rj.q.out 
b/ql/src/test/results/clientpositive/intersect_all_rj.q.out
index 427b841..a807f11 100644
--- a/ql/src/test/results/clientpositive/intersect_all_rj.q.out
+++ b/ql/src/test/results/clientpositive/intersect_all_rj.q.out
@@ -178,16 +178,15 @@ HiveProject($f0=[$1])
             HiveUnion(all=[true])
               HiveProject($f0=[$0], $f1=[$1])
                 HiveAggregate(group=[{0}], agg#0=[count()])
-                  HiveProject($f0=[$0])
-                    HiveAggregate(group=[{0}])
-                      HiveProject($f0=[CASE(IS NOT NULL($3), $3, if($1, $4, 
$2))])
-                        HiveJoin(condition=[>=($0, $5)], joinType=[inner], 
algorithm=[none], cost=[not available])
-                          HiveProject(bigint_col_3=[$1])
-                            HiveFilter(condition=[IS NOT NULL($1)])
-                              HiveTableScan(table=[[default, table_7]], 
table:alias=[a3])
-                          HiveProject(boolean_col_16=[$0], 
timestamp_col_5=[$1], timestamp_col_15=[$2], timestamp_col_30=[$3], 
CAST=[CAST($4):BIGINT])
-                            HiveFilter(condition=[IS NOT 
NULL(CAST($4):BIGINT)])
-                              HiveTableScan(table=[[default, table_10]], 
table:alias=[a4])
+                  HiveProject(CASE=[$0])
+                    HiveAggregate(group=[{1}])
+                      HiveJoin(condition=[>=($0, $2)], joinType=[inner], 
algorithm=[none], cost=[not available])
+                        HiveProject(bigint_col_3=[$1])
+                          HiveFilter(condition=[IS NOT NULL($1)])
+                            HiveTableScan(table=[[default, table_7]], 
table:alias=[a3])
+                        HiveProject(CASE=[CASE(IS NOT NULL($2), $2, if($0, $3, 
$1))], CAST=[CAST($4):BIGINT])
+                          HiveFilter(condition=[IS NOT NULL(CAST($4):BIGINT)])
+                            HiveTableScan(table=[[default, table_10]], 
table:alias=[a4])
               HiveProject($f0=[$0], $f1=[$1])
                 HiveAggregate(group=[{0}], agg#0=[count()])
                   HiveProject($f0=[$0])
diff --git a/ql/src/test/results/clientpositive/llap/subquery_in.q.out 
b/ql/src/test/results/clientpositive/llap/subquery_in.q.out
index 966215c..60522c8 100644
--- a/ql/src/test/results/clientpositive/llap/subquery_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/subquery_in.q.out
@@ -4028,13 +4028,13 @@ STAGE PLANS:
                     Select Operator
                       expressions: p_name (type: string), p_type (type: string)
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 26 Data size: 5954 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 26 Data size: 5850 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
                         key expressions: _col1 (type: string)
                         null sort order: z
                         sort order: +
                         Map-reduce partition columns: _col1 (type: string)
-                        Statistics: Num rows: 26 Data size: 5954 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Statistics: Num rows: 26 Data size: 5850 Basic stats: 
COMPLETE Column stats: COMPLETE
                         value expressions: _col0 (type: string)
             Execution mode: vectorized, llap
             LLAP IO: no inputs
@@ -4087,10 +4087,10 @@ STAGE PLANS:
                 keys:
                   0 _col1 (type: string)
                   1 _col0 (type: string)
-                outputColumnNames: _col0, _col4
+                outputColumnNames: _col0, _col3
                 Statistics: Num rows: 28 Data size: 3500 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: _col0 (type: string), _col4 (type: int)
+                  expressions: _col0 (type: string), _col3 (type: int)
                   outputColumnNames: _col0, _col1
                   Statistics: Num rows: 28 Data size: 3500 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Group By Operator
diff --git 
a/ql/src/test/results/clientpositive/llap/vector_identity_reuse.q.out 
b/ql/src/test/results/clientpositive/llap/vector_identity_reuse.q.out
index 77e118c..6db296d 100644
--- a/ql/src/test/results/clientpositive/llap/vector_identity_reuse.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_identity_reuse.q.out
@@ -172,13 +172,13 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [0, 1]
-                      Statistics: Num rows: 1 Data size: 12 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 1 Data size: 99 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Map Join Operator
                         condition map:
                              Left Outer Join 0 to 1
                         keys:
                           0 _col0 (type: bigint)
-                          1 _col3 (type: bigint)
+                          1 UDFToLong(_col2) (type: bigint)
                         Map Join Vectorization:
                             bigTableKeyColumns: 0:bigint
                             bigTableRetainColumnNums: [1]
@@ -189,7 +189,7 @@ STAGE PLANS:
                             projectedOutput: 1:int, 4:int, 5:smallint, 6:int
                             smallTableValueMapping: 4:int, 5:smallint, 6:int
                             hashTableImplementationType: OPTIMIZED
-                        outputColumnNames: _col1, _col2, _col3, _col4
+                        outputColumnNames: _col1, _col3, _col4, _col5
                         input vertices:
                           1 Map 3
                         Statistics: Num rows: 1 Data size: 16 Basic stats: 
COMPLETE Column stats: COMPLETE
@@ -198,15 +198,16 @@ STAGE PLANS:
                               className: VectorFilterOperator
                               native: true
                               predicateExpression: FilterExprOrExpr(children: 
FilterLongColEqualLongScalar(col 1:int, val 5), 
FilterLongColEqualLongScalar(col 6:int, val 10), 
FilterLongColEqualLongScalar(col 6:bigint, val 571)(children: col 6:int))
-                          predicate: ((_col1 = 5) or (_col4 = 10) or 
(UDFToLong(_col4) = 571L)) (type: boolean)
+                          predicate: ((_col1 = 5) or (_col5 = 10) or 
(UDFToLong(_col5) = 571L)) (type: boolean)
                           Statistics: Num rows: 1 Data size: 16 Basic stats: 
COMPLETE Column stats: COMPLETE
                           Select Operator
-                            expressions: _col2 (type: int), _col3 (type: 
smallint), _col4 (type: int)
+                            expressions: _col3 (type: int), _col5 (type: int), 
CASE WHEN (_col3 is not null) THEN (_col3) ELSE (UDFToInteger(_col4)) END 
(type: int)
                             outputColumnNames: _col0, _col1, _col2
                             Select Vectorization:
                                 className: VectorSelectOperator
                                 native: true
-                                projectedOutputColumnNums: [4, 5, 6]
+                                projectedOutputColumnNums: [4, 6, 8]
+                                selectExpressions: IfExprColumnCondExpr(col 
7:boolean, col 4:intcol 5:smallint)(children: IsNotNull(col 4:int) -> 
7:boolean, col 4:int, col 5:smallint) -> 8:int
                             Statistics: Num rows: 1 Data size: 12 Basic stats: 
COMPLETE Column stats: COMPLETE
                             Map Join Operator
                               condition map:
@@ -215,13 +216,13 @@ STAGE PLANS:
                                 0 
                                 1 
                               Map Join Vectorization:
-                                  bigTableRetainColumnNums: [4, 5, 6]
-                                  bigTableValueColumns: 4:int, 5:smallint, 
6:int
+                                  bigTableRetainColumnNums: [4, 6, 8]
+                                  bigTableValueColumns: 4:int, 6:int, 8:int
                                   className: 
VectorMapJoinInnerBigOnlyMultiKeyOperator
                                   native: true
                                   nativeConditionsMet: 
hive.mapjoin.optimized.hashtable IS true, 
hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine 
tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS 
true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS 
true
                                   nonOuterSmallTableKeyMapping: []
-                                  projectedOutput: 4:int, 5:smallint, 6:int
+                                  projectedOutput: 4:int, 6:int, 8:int
                                   hashTableImplementationType: OPTIMIZED
                               outputColumnNames: _col0, _col1, _col2
                               input vertices:
@@ -238,13 +239,12 @@ STAGE PLANS:
                                     keyExpressions: col 4:int
                                     native: true
                                 Select Operator
-                                  expressions: _col0 (type: int), CASE WHEN 
(_col0 is not null) THEN (_col0) ELSE (UDFToInteger(_col1)) END (type: int), 
_col2 (type: int)
+                                  expressions: _col0 (type: int), _col2 (type: 
int), _col1 (type: int)
                                   outputColumnNames: _col0, _col1, _col2
                                   Select Vectorization:
                                       className: VectorSelectOperator
                                       native: true
                                       projectedOutputColumnNums: [4, 8, 6]
-                                      selectExpressions: 
IfExprColumnCondExpr(col 7:boolean, col 4:intcol 5:smallint)(children: 
IsNotNull(col 4:int) -> 7:boolean, col 4:int, col 5:smallint) -> 8:int
                                   Statistics: Num rows: 1 Data size: 12 Basic 
stats: COMPLETE Column stats: COMPLETE
                                   Reduce Output Operator
                                     key expressions: _col0 (type: int)
@@ -293,26 +293,26 @@ STAGE PLANS:
                     predicate: UDFToLong(CASE WHEN (int_col_5 is not null) 
THEN (int_col_5) ELSE (UDFToInteger(smallint_col_22)) END) is not null (type: 
boolean)
                     Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
                     Select Operator
-                      expressions: int_col_5 (type: int), smallint_col_22 
(type: smallint), CASE WHEN (int_col_5 is not null) THEN (int_col_5) ELSE 
(UDFToInteger(smallint_col_22)) END (type: int), UDFToLong(CASE WHEN (int_col_5 
is not null) THEN (int_col_5) ELSE (UDFToInteger(smallint_col_22)) END) (type: 
bigint)
-                      outputColumnNames: _col0, _col1, _col2, _col3
+                      expressions: int_col_5 (type: int), smallint_col_22 
(type: smallint), CASE WHEN (int_col_5 is not null) THEN (int_col_5) ELSE 
(UDFToInteger(smallint_col_22)) END (type: int)
+                      outputColumnNames: _col0, _col1, _col2
                       Select Vectorization:
                           className: VectorSelectOperator
                           native: true
-                          projectedOutputColumnNums: [1, 0, 6, 8]
-                          selectExpressions: IfExprColumnCondExpr(col 
5:boolean, col 1:intcol 0:smallint)(children: IsNotNull(col 1:int) -> 
5:boolean, col 1:int, col 0:smallint) -> 6:int, IfExprColumnCondExpr(col 
7:boolean, col 1:intcol 0:smallint)(children: IsNotNull(col 1:int) -> 
7:boolean, col 1:int, col 0:smallint) -> 8:int
-                      Statistics: Num rows: 1 Data size: 20 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          projectedOutputColumnNums: [1, 0, 6]
+                          selectExpressions: IfExprColumnCondExpr(col 
5:boolean, col 1:intcol 0:smallint)(children: IsNotNull(col 1:int) -> 
5:boolean, col 1:int, col 0:smallint) -> 6:int
+                      Statistics: Num rows: 1 Data size: 12 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
-                        key expressions: _col3 (type: bigint)
+                        key expressions: UDFToLong(_col2) (type: bigint)
                         null sort order: z
                         sort order: +
-                        Map-reduce partition columns: _col3 (type: bigint)
+                        Map-reduce partition columns: UDFToLong(_col2) (type: 
bigint)
                         Reduce Sink Vectorization:
                             className: VectorReduceSinkLongOperator
-                            keyColumns: 8:bigint
+                            keyColumns: 6:bigint
                             native: true
                             nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine 
tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, 
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
                             valueColumns: 1:int, 0:smallint, 6:int
-                        Statistics: Num rows: 1 Data size: 20 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Statistics: Num rows: 1 Data size: 12 Basic stats: 
COMPLETE Column stats: COMPLETE
                         value expressions: _col0 (type: int), _col1 (type: 
smallint), _col2 (type: int)
             Execution mode: vectorized, llap
             LLAP IO: no inputs
@@ -330,7 +330,7 @@ STAGE PLANS:
                     includeColumns: [0, 1]
                     dataColumns: smallint_col_22:smallint, int_col_5:int
                     partitionColumnCount: 0
-                    scratchColumnTypeNames: [bigint, bigint, bigint, bigint, 
bigint, bigint]
+                    scratchColumnTypeNames: [bigint, bigint, bigint, bigint]
         Map 4 
             Map Operator Tree:
                 TableScan
diff --git a/ql/src/test/results/clientpositive/perf/spark/query93.q.out 
b/ql/src/test/results/clientpositive/perf/spark/query93.q.out
index 5cc7058..09441d2 100644
--- a/ql/src/test/results/clientpositive/perf/spark/query93.q.out
+++ b/ql/src/test/results/clientpositive/perf/spark/query93.q.out
@@ -89,8 +89,8 @@ STAGE PLANS:
                     predicate: (sr_item_sk is not null and sr_ticket_number is 
not null and sr_reason_sk is not null) (type: boolean)
                     Statistics: Num rows: 57591150 Data size: 4462194832 Basic 
stats: COMPLETE Column stats: NONE
                     Select Operator
-                      expressions: sr_item_sk (type: int), sr_reason_sk (type: 
int), sr_ticket_number (type: int), sr_return_quantity (type: int)
-                      outputColumnNames: _col0, _col1, _col2, _col3
+                      expressions: sr_item_sk (type: int), sr_reason_sk (type: 
int), sr_ticket_number (type: int), sr_return_quantity (type: int), 
sr_return_quantity is not null (type: boolean)
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4
                       Statistics: Num rows: 57591150 Data size: 4462194832 
Basic stats: COMPLETE Column stats: NONE
                       Map Join Operator
                         condition map:
@@ -98,7 +98,7 @@ STAGE PLANS:
                         keys:
                           0 _col1 (type: int)
                           1 _col0 (type: int)
-                        outputColumnNames: _col0, _col2, _col3
+                        outputColumnNames: _col0, _col2, _col3, _col4
                         input vertices:
                           1 Map 5
                         Statistics: Num rows: 63350266 Data size: 4908414421 
Basic stats: COMPLETE Column stats: NONE
@@ -108,7 +108,7 @@ STAGE PLANS:
                           sort order: ++
                           Map-reduce partition columns: _col0 (type: int), 
_col2 (type: int)
                           Statistics: Num rows: 63350266 Data size: 4908414421 
Basic stats: COMPLETE Column stats: NONE
-                          value expressions: _col3 (type: int)
+                          value expressions: _col3 (type: int), _col4 (type: 
boolean)
             Execution mode: vectorized
             Local Work:
               Map Reduce Local Work
@@ -122,8 +122,8 @@ STAGE PLANS:
                     predicate: (ss_item_sk is not null and ss_ticket_number is 
not null) (type: boolean)
                     Statistics: Num rows: 575995635 Data size: 50814502088 
Basic stats: COMPLETE Column stats: NONE
                     Select Operator
-                      expressions: ss_item_sk (type: int), ss_customer_sk 
(type: int), ss_ticket_number (type: int), ss_quantity (type: int), 
ss_sales_price (type: decimal(7,2))
-                      outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                      expressions: ss_item_sk (type: int), ss_customer_sk 
(type: int), ss_ticket_number (type: int), ss_quantity (type: int), 
ss_sales_price (type: decimal(7,2)), (CAST( ss_quantity AS decimal(10,0)) * 
ss_sales_price) (type: decimal(18,2))
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5
                       Statistics: Num rows: 575995635 Data size: 50814502088 
Basic stats: COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: int), _col2 (type: int)
@@ -131,7 +131,7 @@ STAGE PLANS:
                         sort order: ++
                         Map-reduce partition columns: _col0 (type: int), _col2 
(type: int)
                         Statistics: Num rows: 575995635 Data size: 50814502088 
Basic stats: COMPLETE Column stats: NONE
-                        value expressions: _col1 (type: int), _col3 (type: 
int), _col4 (type: decimal(7,2))
+                        value expressions: _col1 (type: int), _col3 (type: 
int), _col4 (type: decimal(7,2)), _col5 (type: decimal(18,2))
             Execution mode: vectorized
         Reducer 2 
             Reduce Operator Tree:
@@ -141,10 +141,10 @@ STAGE PLANS:
                 keys:
                   0 _col0 (type: int), _col2 (type: int)
                   1 _col0 (type: int), _col2 (type: int)
-                outputColumnNames: _col3, _col6, _col8, _col9
+                outputColumnNames: _col3, _col4, _col7, _col9, _col10, _col11
                 Statistics: Num rows: 633595212 Data size: 55895953508 Basic 
stats: COMPLETE Column stats: NONE
                 Select Operator
-                  expressions: _col6 (type: int), CASE WHEN (_col3 is not 
null) THEN ((CAST( (_col8 - _col3) AS decimal(10,0)) * _col9)) ELSE ((CAST( 
_col8 AS decimal(10,0)) * _col9)) END (type: decimal(18,2))
+                  expressions: _col7 (type: int), CASE WHEN (_col4) THEN 
((CAST( (_col9 - _col3) AS decimal(10,0)) * _col10)) ELSE (_col11) END (type: 
decimal(18,2))
                   outputColumnNames: _col0, _col1
                   Statistics: Num rows: 633595212 Data size: 55895953508 Basic 
stats: COMPLETE Column stats: NONE
                   Group By Operator
diff --git a/ql/src/test/results/clientpositive/perf/tez/cbo_query93.q.out 
b/ql/src/test/results/clientpositive/perf/tez/cbo_query93.q.out
index b0b8169..6a8ed39 100644
--- a/ql/src/test/results/clientpositive/perf/tez/cbo_query93.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/cbo_query93.q.out
@@ -44,13 +44,13 @@ CBO PLAN:
 HiveSortLimit(sort0=[$1], sort1=[$0], dir0=[ASC], dir1=[ASC], fetch=[100])
   HiveProject(ss_customer_sk=[$0], $f1=[$1])
     HiveAggregate(group=[{0}], agg#0=[sum($1)])
-      HiveProject(ss_customer_sk=[$1], act_sales=[CASE(IS NOT NULL($8), 
*(CAST(-($3, $8)):DECIMAL(10, 0), $4), *(CAST($3):DECIMAL(10, 0), $4))])
-        HiveJoin(condition=[AND(=($5, $0), =($7, $2))], joinType=[inner], 
algorithm=[none], cost=[not available])
-          HiveProject(ss_item_sk=[$2], ss_customer_sk=[$3], 
ss_ticket_number=[$9], ss_quantity=[$10], ss_sales_price=[$13])
+      HiveProject(ss_customer_sk=[$1], act_sales=[CASE($10, *(CAST(-($3, 
$9)):DECIMAL(10, 0), $4), $5)])
+        HiveJoin(condition=[AND(=($6, $0), =($8, $2))], joinType=[inner], 
algorithm=[none], cost=[not available])
+          HiveProject(ss_item_sk=[$2], ss_customer_sk=[$3], 
ss_ticket_number=[$9], ss_quantity=[$10], ss_sales_price=[$13], 
*=[*(CAST($10):DECIMAL(10, 0), $13)])
             HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($9))])
               HiveTableScan(table=[[default, store_sales]], 
table:alias=[store_sales])
-          HiveJoin(condition=[=($1, $4)], joinType=[inner], algorithm=[none], 
cost=[not available])
-            HiveProject(sr_item_sk=[$2], sr_reason_sk=[$8], 
sr_ticket_number=[$9], sr_return_quantity=[$10])
+          HiveJoin(condition=[=($1, $5)], joinType=[inner], algorithm=[none], 
cost=[not available])
+            HiveProject(sr_item_sk=[$2], sr_reason_sk=[$8], 
sr_ticket_number=[$9], sr_return_quantity=[$10], IS NOT NULL=[IS NOT NULL($10)])
               HiveFilter(condition=[AND(IS NOT NULL($8), IS NOT NULL($2), IS 
NOT NULL($9))])
                 HiveTableScan(table=[[default, store_returns]], 
table:alias=[store_returns])
             HiveProject(r_reason_sk=[$0])
diff --git a/ql/src/test/results/clientpositive/perf/tez/query93.q.out 
b/ql/src/test/results/clientpositive/perf/tez/query93.q.out
index d0bde22..6a47c90 100644
--- a/ql/src/test/results/clientpositive/perf/tez/query93.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/query93.q.out
@@ -69,15 +69,15 @@ Stage-0
                     PartitionCols:_col0
                     Group By Operator [GBY_17] (rows=569016747 width=115)
                       
Output:["_col0","_col1"],aggregations:["sum(_col1)"],keys:_col0
-                      Select Operator [SEL_15] (rows=569016747 width=118)
+                      Select Operator [SEL_15] (rows=569016747 width=234)
                         Output:["_col0","_col1"]
-                        Merge Join Operator [MERGEJOIN_65] (rows=569016747 
width=118)
-                          Conds:RS_12._col0, _col2=RS_74._col0, 
_col2(Inner),Output:["_col3","_col6","_col8","_col9"]
+                        Merge Join Operator [MERGEJOIN_65] (rows=569016747 
width=234)
+                          Conds:RS_12._col0, _col2=RS_74._col0, 
_col2(Inner),Output:["_col3","_col4","_col7","_col9","_col10","_col11"]
                         <-Map 7 [SIMPLE_EDGE] vectorized
                           SHUFFLE [RS_74]
                             PartitionCols:_col0, _col2
-                            Select Operator [SEL_73] (rows=575995635 width=122)
-                              Output:["_col0","_col1","_col2","_col3","_col4"]
+                            Select Operator [SEL_73] (rows=575995635 width=234)
+                              
Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
                               Filter Operator [FIL_72] (rows=575995635 
width=122)
                                 predicate:(ss_item_sk is not null and 
ss_ticket_number is not null)
                                 TableScan [TS_6] (rows=575995635 width=122)
@@ -85,13 +85,13 @@ Stage-0
                         <-Reducer 2 [SIMPLE_EDGE]
                           SHUFFLE [RS_12]
                             PartitionCols:_col0, _col2
-                            Merge Join Operator [MERGEJOIN_64] (rows=55574563 
width=11)
-                              
Conds:RS_68._col1=RS_71._col0(Inner),Output:["_col0","_col2","_col3"]
+                            Merge Join Operator [MERGEJOIN_64] (rows=55574563 
width=15)
+                              
Conds:RS_68._col1=RS_71._col0(Inner),Output:["_col0","_col2","_col3","_col4"]
                             <-Map 1 [SIMPLE_EDGE] vectorized
                               SHUFFLE [RS_68]
                                 PartitionCols:_col1
-                                Select Operator [SEL_67] (rows=55574563 
width=15)
-                                  Output:["_col0","_col1","_col2","_col3"]
+                                Select Operator [SEL_67] (rows=55574563 
width=19)
+                                  
Output:["_col0","_col1","_col2","_col3","_col4"]
                                   Filter Operator [FIL_66] (rows=55574563 
width=15)
                                     predicate:(sr_reason_sk is not null and 
sr_item_sk is not null and sr_ticket_number is not null)
                                     TableScan [TS_0] (rows=57591150 width=15)
diff --git a/ql/src/test/results/clientpositive/structin.q.out 
b/ql/src/test/results/clientpositive/structin.q.out
index 0fe88f5..49c4b56 100644
--- a/ql/src/test/results/clientpositive/structin.q.out
+++ b/ql/src/test/results/clientpositive/structin.q.out
@@ -119,3 +119,130 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: CREATE TABLE test_struct
+(
+  f1 string,
+  demo_struct struct<f1:string, f2:string, f3:string>,
+  datestr string
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_struct
+POSTHOOK: query: CREATE TABLE test_struct
+(
+  f1 string,
+  demo_struct struct<f1:string, f2:string, f3:string>,
+  datestr string
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_struct
+PREHOOK: query: insert into test_struct values('s1', named_struct('f1','1', 
'f2','2', 'f3','3'), '02-02-2020')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_struct
+POSTHOOK: query: insert into test_struct values('s1', named_struct('f1','1', 
'f2','2', 'f3','3'), '02-02-2020')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_struct
+POSTHOOK: Lineage: test_struct.datestr SCRIPT []
+POSTHOOK: Lineage: test_struct.demo_struct SCRIPT []
+POSTHOOK: Lineage: test_struct.f1 SCRIPT []
+PREHOOK: query: insert into test_struct values('s2', 
named_struct('f1',cast(null as string),'f2', cast(null as string),'f3', 
cast(null as string)), '02-02-2020')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_struct
+POSTHOOK: query: insert into test_struct values('s2', 
named_struct('f1',cast(null as string),'f2', cast(null as string),'f3', 
cast(null as string)), '02-02-2020')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_struct
+POSTHOOK: Lineage: test_struct.datestr SCRIPT []
+POSTHOOK: Lineage: test_struct.demo_struct SCRIPT []
+POSTHOOK: Lineage: test_struct.f1 SCRIPT []
+PREHOOK: query: insert into test_struct values('s4', named_struct('f1','100', 
'f2','200', 'f3','300'), '02-02-2020')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_struct
+POSTHOOK: query: insert into test_struct values('s4', named_struct('f1','100', 
'f2','200', 'f3','300'), '02-02-2020')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_struct
+POSTHOOK: Lineage: test_struct.datestr SCRIPT []
+POSTHOOK: Lineage: test_struct.demo_struct SCRIPT []
+POSTHOOK: Lineage: test_struct.f1 SCRIPT []
+PREHOOK: query: explain select * from test_struct where datestr='02-02-2020' 
and demo_struct is not null
+ order by f1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_struct
+#### A masked pattern was here ####
+POSTHOOK: query: explain select * from test_struct where datestr='02-02-2020' 
and demo_struct is not null
+ order by f1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_struct
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: test_struct
+            filterExpr: ((datestr = '02-02-2020') and demo_struct is not null) 
(type: boolean)
+            Statistics: Num rows: 3 Data size: 2658 Basic stats: COMPLETE 
Column stats: NONE
+            Filter Operator
+              predicate: ((datestr = '02-02-2020') and demo_struct is not 
null) (type: boolean)
+              Statistics: Num rows: 3 Data size: 2658 Basic stats: COMPLETE 
Column stats: NONE
+              Select Operator
+                expressions: f1 (type: string), demo_struct (type: 
struct<f1:string,f2:string,f3:string>)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 3 Data size: 2658 Basic stats: COMPLETE 
Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string)
+                  null sort order: z
+                  sort order: +
+                  Statistics: Num rows: 3 Data size: 2658 Basic stats: 
COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: 
struct<f1:string,f2:string,f3:string>)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: 
struct<f1:string,f2:string,f3:string>), '02-02-2020' (type: string)
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 3 Data size: 2658 Basic stats: COMPLETE Column 
stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 3 Data size: 2658 Basic stats: COMPLETE 
Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from test_struct where datestr='02-02-2020' and 
demo_struct is not null
+ order by f1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_struct
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test_struct where datestr='02-02-2020' and 
demo_struct is not null
+ order by f1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_struct
+#### A masked pattern was here ####
+s1     {"f1":"1","f2":"2","f3":"3"}    02-02-2020
+s2     {"f1":null,"f2":null,"f3":null} 02-02-2020
+s4     {"f1":"100","f2":"200","f3":"300"}      02-02-2020
+PREHOOK: query: DROP TABLE test_struct
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@test_struct
+PREHOOK: Output: default@test_struct
+POSTHOOK: query: DROP TABLE test_struct
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@test_struct
+POSTHOOK: Output: default@test_struct

[hive] branch master updated: HIVE-21778: CBO: "Struct is not null" gets evaluated as `nullable` always causing filter miss in the query (Vineet Garg, reviewed by Jesus Camacho Rodriguez)

Reply via email to