Repository: hive
Updated Branches:
  refs/heads/master f34149ab5 -> 0b2d364aa


HIVE-19557: stats: filters for dates are not taking advantage of min/max values 
(Zoltan Haindrich reviewed by Ashutosh Chauhan)

Signed-off-by: Zoltan Haindrich <k...@rxd.hu>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0b2d364a
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0b2d364a
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0b2d364a

Branch: refs/heads/master
Commit: 0b2d364aa5977c7a15a4b5082cf9ef431dcc394e
Parents: f34149a
Author: Zoltan Haindrich <k...@rxd.hu>
Authored: Wed May 23 17:15:39 2018 +0200
Committer: Zoltan Haindrich <k...@rxd.hu>
Committed: Wed May 23 17:15:39 2018 +0200

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../stats/annotation/StatsRulesProcFactory.java |  10 +-
 .../clientpositive/colstats_date_min_max.q      |  30 +++
 .../llap/colstats_date_min_max.q.out            | 193 +++++++++++++++++++
 .../clientpositive/llap/vector_between_in.q.out |  20 +-
 5 files changed, 243 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties 
b/itests/src/test/resources/testconfiguration.properties
index 6528ec6..6007d5a 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -488,6 +488,7 @@ minillaplocal.query.files=\
   cbo_subq_not_in.q,\
   column_table_stats.q,\
   column_table_stats_orc.q,\
+  colstats_date_min_max.q,\
   compare_double_bigint_2.q,\
   constprog_dpp.q,\
   current_date_timestamp.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index c770227..91cccfb 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -100,6 +100,8 @@ import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
 import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -751,8 +753,14 @@ public class StatsRulesProcFactory {
             }
           } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) ||
                   colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
+            int value;
+            if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
+              DateWritable writableVal = new 
DateWritable(java.sql.Date.valueOf(boundValue));
+              value = writableVal.getDays();
+            } else {
+              value = Integer.parseInt(boundValue);
+            }
             // Date is an integer internally
-            int value = new Integer(boundValue);
             int maxValue = cs.getRange().maxValue.intValue();
             int minValue = cs.getRange().minValue.intValue();
             if (upperBound) {

http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/test/queries/clientpositive/colstats_date_min_max.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/colstats_date_min_max.q 
b/ql/src/test/queries/clientpositive/colstats_date_min_max.q
new file mode 100644
index 0000000..7f5be6a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/colstats_date_min_max.q
@@ -0,0 +1,30 @@
+set hive.explain.user=true;
+
+create table d1(d date);
+--  tblproperties('transactional'='false');
+
+insert into d1 values
+       ('2010-10-01'),
+       ('2010-10-02'),
+       ('2010-10-03'),
+       ('2010-10-04'),
+       ('2010-10-05'),
+       ('2010-10-06'),
+       ('2010-10-07'),
+       ('2010-10-08'),
+       ('2010-10-09'),
+       ('2010-10-10');
+
+analyze table d1 compute statistics for columns;
+
+desc formatted d1;
+desc formatted d1 d;
+
+explain
+select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01';
+
+explain
+select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03';
+
+explain
+select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03';

http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out
----------------------------------------------------------------------
diff --git 
a/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out 
b/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out
new file mode 100644
index 0000000..7754f3e
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out
@@ -0,0 +1,193 @@
+PREHOOK: query: create table d1(d date)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@d1
+POSTHOOK: query: create table d1(d date)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@d1
+PREHOOK: query: insert into d1 values
+       ('2010-10-01'),
+       ('2010-10-02'),
+       ('2010-10-03'),
+       ('2010-10-04'),
+       ('2010-10-05'),
+       ('2010-10-06'),
+       ('2010-10-07'),
+       ('2010-10-08'),
+       ('2010-10-09'),
+       ('2010-10-10')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@d1
+POSTHOOK: query: insert into d1 values
+       ('2010-10-01'),
+       ('2010-10-02'),
+       ('2010-10-03'),
+       ('2010-10-04'),
+       ('2010-10-05'),
+       ('2010-10-06'),
+       ('2010-10-07'),
+       ('2010-10-08'),
+       ('2010-10-09'),
+       ('2010-10-10')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@d1
+POSTHOOK: Lineage: d1.d SCRIPT []
+PREHOOK: query: analyze table d1 compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@d1
+PREHOOK: Output: default@d1
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table d1 compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@d1
+POSTHOOK: Output: default@d1
+#### A masked pattern was here ####
+PREHOOK: query: desc formatted d1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@d1
+POSTHOOK: query: desc formatted d1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@d1
+# col_name             data_type               comment             
+d                      date                                        
+                
+# Detailed Table Information            
+Database:              default                  
+#### A masked pattern was here ####
+Retention:             0                        
+#### A masked pattern was here ####
+Table Type:            MANAGED_TABLE            
+Table Parameters:               
+       COLUMN_STATS_ACCURATE   
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\"}}
+       bucketing_version       2                   
+       numFiles                1                   
+       numRows                 10                  
+       rawDataSize             100                 
+       totalSize               110                 
+#### A masked pattern was here ####
+                
+# Storage Information           
+SerDe Library:         org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe      
 
+InputFormat:           org.apache.hadoop.mapred.TextInputFormat         
+OutputFormat:          
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat       
+Compressed:            No                       
+Num Buckets:           -1                       
+Bucket Columns:        []                       
+Sort Columns:          []                       
+Storage Desc Params:            
+       serialization.format    1                   
+PREHOOK: query: desc formatted d1 d
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@d1
+POSTHOOK: query: desc formatted d1 d
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@d1
+col_name               d                                                       
                                         
+data_type              date                                                    
                                         
+min                    2010-10-01                                              
                                         
+max                    2010-10-10                                              
                                         
+num_nulls              0                                                       
                                         
+distinct_count         10                                                      
                                         
+avg_col_len                                                                    
                                         
+max_col_len                                                                    
                                         
+num_trues                                                                      
                                         
+num_falses                                                                     
                                         
+bitVector              HL                                                      
                                         
+comment                from deserializer                                       
                                         
+COLUMN_STATS_ACCURATE  
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\"}}                    
                                                         
+PREHOOK: query: explain
+select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01'
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 2 vectorized, llap
+      File Output Operator [FS_15]
+        Select Operator [SEL_14] (rows=1 width=110)
+          Output:["_col0","_col1"]
+          Group By Operator [GBY_13] (rows=1 width=8)
+            Output:["_col0"],aggregations:["count(VALUE._col0)"]
+          <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap
+            PARTITION_ONLY_SHUFFLE [RS_12]
+              Group By Operator [GBY_11] (rows=1 width=8)
+                Output:["_col0"],aggregations:["count()"]
+                Select Operator [SEL_10] (rows=1 width=56)
+                  Filter Operator [FIL_9] (rows=1 width=56)
+                    predicate:(d < DATE'2010-03-01')
+                    TableScan [TS_0] (rows=10 width=56)
+                      default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"]
+
+PREHOOK: query: explain
+select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03'
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 2 vectorized, llap
+      File Output Operator [FS_15]
+        Select Operator [SEL_14] (rows=1 width=121)
+          Output:["_col0","_col1"]
+          Group By Operator [GBY_13] (rows=1 width=8)
+            Output:["_col0"],aggregations:["count(VALUE._col0)"]
+          <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap
+            PARTITION_ONLY_SHUFFLE [RS_12]
+              Group By Operator [GBY_11] (rows=1 width=8)
+                Output:["_col0"],aggregations:["count()"]
+                Select Operator [SEL_10] (rows=3 width=56)
+                  Filter Operator [FIL_9] (rows=3 width=56)
+                    predicate:(d < DATE'2010-10-03')
+                    TableScan [TS_0] (rows=10 width=56)
+                      default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"]
+
+PREHOOK: query: explain
+select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03'
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 2 vectorized, llap
+      File Output Operator [FS_15]
+        Select Operator [SEL_14] (rows=1 width=120)
+          Output:["_col0","_col1"]
+          Group By Operator [GBY_13] (rows=1 width=8)
+            Output:["_col0"],aggregations:["count(VALUE._col0)"]
+          <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap
+            PARTITION_ONLY_SHUFFLE [RS_12]
+              Group By Operator [GBY_11] (rows=1 width=8)
+                Output:["_col0"],aggregations:["count()"]
+                Select Operator [SEL_10] (rows=10 width=56)
+                  Filter Operator [FIL_9] (rows=10 width=56)
+                    predicate:(d < DATE'2010-11-03')
+                    TableScan [TS_0] (rows=10 width=56)
+                      default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"]
+

http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out 
b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
index 6093beb..f76053e 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
@@ -465,7 +465,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: FilterLongColumnBetween(col 
3:date, left -2, right 1)
                     predicate: cdate BETWEEN DATE'1969-12-30' AND 
DATE'1970-01-02' (type: boolean)
-                    Statistics: Num rows: 1365 Data size: 72627 Basic stats: 
COMPLETE Column stats: NONE
+                    Statistics: Num rows: 4096 Data size: 217934 Basic stats: 
COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cdate (type: date)
                       outputColumnNames: _col0
@@ -473,7 +473,7 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [3]
-                      Statistics: Num rows: 1365 Data size: 72627 Basic stats: 
COMPLETE Column stats: NONE
+                      Statistics: Num rows: 4096 Data size: 217934 Basic 
stats: COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: date)
                         sort order: +
@@ -481,7 +481,7 @@ STAGE PLANS:
                             className: VectorReduceSinkObjectHashOperator
                             native: true
                             nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine 
tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, 
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        Statistics: Num rows: 1365 Data size: 72627 Basic 
stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 4096 Data size: 217934 Basic 
stats: COMPLETE Column stats: NONE
             Execution mode: vectorized, llap
             LLAP IO: all inputs
             Map Vectorization:
@@ -509,13 +509,13 @@ STAGE PLANS:
                     className: VectorSelectOperator
                     native: true
                     projectedOutputColumnNums: [0]
-                Statistics: Num rows: 1365 Data size: 72627 Basic stats: 
COMPLETE Column stats: NONE
+                Statistics: Num rows: 4096 Data size: 217934 Basic stats: 
COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   File Sink Vectorization:
                       className: VectorFileSinkOperator
                       native: false
-                  Statistics: Num rows: 1365 Data size: 72627 Basic stats: 
COMPLETE Column stats: NONE
+                  Statistics: Num rows: 4096 Data size: 217934 Basic stats: 
COMPLETE Column stats: NONE
                   table:
                       input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -560,7 +560,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: FilterLongColumnNotBetween(col 
3:date, left -610, right 608)
                     predicate: cdate NOT BETWEEN DATE'1968-05-01' AND 
DATE'1971-09-01' (type: boolean)
-                    Statistics: Num rows: 10924 Data size: 581228 Basic stats: 
COMPLETE Column stats: NONE
+                    Statistics: Num rows: 8193 Data size: 435921 Basic stats: 
COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cdate (type: date)
                       outputColumnNames: _col0
@@ -568,7 +568,7 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [3]
-                      Statistics: Num rows: 10924 Data size: 581228 Basic 
stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 8193 Data size: 435921 Basic 
stats: COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: date)
                         sort order: +
@@ -576,7 +576,7 @@ STAGE PLANS:
                             className: VectorReduceSinkObjectHashOperator
                             native: true
                             nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine 
tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, 
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        Statistics: Num rows: 10924 Data size: 581228 Basic 
stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 8193 Data size: 435921 Basic 
stats: COMPLETE Column stats: NONE
             Execution mode: vectorized, llap
             LLAP IO: all inputs
             Map Vectorization:
@@ -604,13 +604,13 @@ STAGE PLANS:
                     className: VectorSelectOperator
                     native: true
                     projectedOutputColumnNums: [0]
-                Statistics: Num rows: 10924 Data size: 581228 Basic stats: 
COMPLETE Column stats: NONE
+                Statistics: Num rows: 8193 Data size: 435921 Basic stats: 
COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   File Sink Vectorization:
                       className: VectorFileSinkOperator
                       native: false
-                  Statistics: Num rows: 10924 Data size: 581228 Basic stats: 
COMPLETE Column stats: NONE
+                  Statistics: Num rows: 8193 Data size: 435921 Basic stats: 
COMPLETE Column stats: NONE
                   table:
                       input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat

Reply via email to