Repository: hive Updated Branches: refs/heads/master f34149ab5 -> 0b2d364aa
HIVE-19557: stats: filters for dates are not taking advantage of min/max values (Zoltan Haindrich reviewed by Ashutosh Chauhan) Signed-off-by: Zoltan Haindrich <k...@rxd.hu> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0b2d364a Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0b2d364a Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0b2d364a Branch: refs/heads/master Commit: 0b2d364aa5977c7a15a4b5082cf9ef431dcc394e Parents: f34149a Author: Zoltan Haindrich <k...@rxd.hu> Authored: Wed May 23 17:15:39 2018 +0200 Committer: Zoltan Haindrich <k...@rxd.hu> Committed: Wed May 23 17:15:39 2018 +0200 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 1 + .../stats/annotation/StatsRulesProcFactory.java | 10 +- .../clientpositive/colstats_date_min_max.q | 30 +++ .../llap/colstats_date_min_max.q.out | 193 +++++++++++++++++++ .../clientpositive/llap/vector_between_in.q.out | 20 +- 5 files changed, 243 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 6528ec6..6007d5a 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -488,6 +488,7 @@ minillaplocal.query.files=\ cbo_subq_not_in.q,\ column_table_stats.q,\ column_table_stats_orc.q,\ + colstats_date_min_max.q,\ compare_double_bigint_2.q,\ constprog_dpp.q,\ current_date_timestamp.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index c770227..91cccfb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -100,6 +100,8 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -751,8 +753,14 @@ public class StatsRulesProcFactory { } } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) || colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) { + int value; + if (colTypeLowerCase == serdeConstants.DATE_TYPE_NAME) { + DateWritable writableVal = new DateWritable(java.sql.Date.valueOf(boundValue)); + value = writableVal.getDays(); + } else { + value = new Integer(boundValue); + } // Date is an integer internally - int value = new Integer(boundValue); int maxValue = cs.getRange().maxValue.intValue(); int minValue = cs.getRange().minValue.intValue(); if (upperBound) { http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/test/queries/clientpositive/colstats_date_min_max.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/colstats_date_min_max.q b/ql/src/test/queries/clientpositive/colstats_date_min_max.q new file mode 100644 index 0000000..7f5be6a --- /dev/null +++ b/ql/src/test/queries/clientpositive/colstats_date_min_max.q @@ -0,0 +1,30 @@ +set hive.explain.user=true; + +create table d1(d date); +-- tblproperties('transactional'='false'); + +insert into d1 values + ('2010-10-01'), + ('2010-10-02'), + ('2010-10-03'), + ('2010-10-04'), + ('2010-10-05'), + ('2010-10-06'), + ('2010-10-07'), + ('2010-10-08'), + ('2010-10-09'), + ('2010-10-10'); + +analyze table d1 compute statistics for columns; + +desc formatted d1; +desc formatted d1 d; + +explain +select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01'; + +explain +select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03'; + +explain +select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03'; http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out b/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out new file mode 100644 index 0000000..7754f3e --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out @@ -0,0 +1,193 @@ +PREHOOK: query: create table d1(d date) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@d1 +POSTHOOK: query: create table d1(d date) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@d1 +PREHOOK: query: insert into d1 values + ('2010-10-01'), + ('2010-10-02'), + ('2010-10-03'), + ('2010-10-04'), + ('2010-10-05'), + ('2010-10-06'), + ('2010-10-07'), + ('2010-10-08'), + ('2010-10-09'), + ('2010-10-10') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@d1 +POSTHOOK: query: insert into d1 values + ('2010-10-01'), + ('2010-10-02'), + ('2010-10-03'), + ('2010-10-04'), + ('2010-10-05'), + ('2010-10-06'), + ('2010-10-07'), + ('2010-10-08'), + ('2010-10-09'), + ('2010-10-10') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@d1 +POSTHOOK: Lineage: d1.d SCRIPT [] +PREHOOK: query: analyze table d1 compute statistics for columns +PREHOOK: type: ANALYZE_TABLE +PREHOOK: Input: default@d1 +PREHOOK: Output: default@d1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table d1 compute statistics for columns +POSTHOOK: type: ANALYZE_TABLE +POSTHOOK: Input: default@d1 +POSTHOOK: Output: default@d1 +#### A masked pattern was here #### +PREHOOK: query: desc formatted d1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@d1 +POSTHOOK: query: desc formatted d1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@d1 +# col_name data_type comment +d date + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\"}} + bucketing_version 2 + numFiles 1 + numRows 10 + rawDataSize 100 + totalSize 110 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted d1 d +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@d1 +POSTHOOK: query: desc formatted d1 d +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@d1 +col_name d +data_type date +min 2010-10-01 +max 2010-10-10 +num_nulls 0 +distinct_count 10 +avg_col_len +max_col_len +num_trues +num_falses +bitVector HL +comment from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\"}} +PREHOOK: query: explain +select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01' +PREHOOK: type: QUERY +POSTHOOK: query: explain +select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01' +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_15] + Select Operator [SEL_14] (rows=1 width=110) + Output:["_col0","_col1"] + Group By Operator [GBY_13] (rows=1 width=8) + Output:["_col0"],aggregations:["count(VALUE._col0)"] + <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap + PARTITION_ONLY_SHUFFLE [RS_12] + Group By Operator [GBY_11] (rows=1 width=8) + Output:["_col0"],aggregations:["count()"] + Select Operator [SEL_10] (rows=1 width=56) + Filter Operator [FIL_9] (rows=1 width=56) + predicate:(d < DATE'2010-03-01') + TableScan [TS_0] (rows=10 width=56) + default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"] + +PREHOOK: query: explain +select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03' +PREHOOK: type: QUERY +POSTHOOK: query: explain +select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03' +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_15] + Select Operator [SEL_14] (rows=1 width=121) + Output:["_col0","_col1"] + Group By Operator [GBY_13] (rows=1 width=8) + Output:["_col0"],aggregations:["count(VALUE._col0)"] + <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap + PARTITION_ONLY_SHUFFLE [RS_12] + Group By Operator [GBY_11] (rows=1 width=8) + Output:["_col0"],aggregations:["count()"] + Select Operator [SEL_10] (rows=3 width=56) + Filter Operator [FIL_9] (rows=3 width=56) + predicate:(d < DATE'2010-10-03') + TableScan [TS_0] (rows=10 width=56) + default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"] + +PREHOOK: query: explain +select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03' +PREHOOK: type: QUERY +POSTHOOK: query: explain +select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03' +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_15] + Select Operator [SEL_14] (rows=1 width=120) + Output:["_col0","_col1"] + Group By Operator [GBY_13] (rows=1 width=8) + Output:["_col0"],aggregations:["count(VALUE._col0)"] + <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap + PARTITION_ONLY_SHUFFLE [RS_12] + Group By Operator [GBY_11] (rows=1 width=8) + Output:["_col0"],aggregations:["count()"] + Select Operator [SEL_10] (rows=10 width=56) + Filter Operator [FIL_9] (rows=10 width=56) + predicate:(d < DATE'2010-11-03') + TableScan [TS_0] (rows=10 width=56) + default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"] + http://git-wip-us.apache.org/repos/asf/hive/blob/0b2d364a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out index 6093beb..f76053e 100644 --- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out @@ -465,7 +465,7 @@ STAGE PLANS: native: true predicateExpression: FilterLongColumnBetween(col 3:date, left -2, right 1) predicate: cdate BETWEEN DATE'1969-12-30' AND DATE'1970-01-02' (type: boolean) - Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cdate (type: date) outputColumnNames: _col0 @@ -473,7 +473,7 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [3] - Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: date) sort order: + @@ -481,7 +481,7 @@ STAGE PLANS: className: VectorReduceSinkObjectHashOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs Map Vectorization: @@ -509,13 +509,13 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -560,7 +560,7 @@ STAGE PLANS: native: true predicateExpression: FilterLongColumnNotBetween(col 3:date, left -610, right 608) predicate: cdate NOT BETWEEN DATE'1968-05-01' AND DATE'1971-09-01' (type: boolean) - Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cdate (type: date) outputColumnNames: _col0 @@ -568,7 +568,7 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [3] - Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: date) sort order: + @@ -576,7 +576,7 @@ STAGE PLANS: className: VectorReduceSinkObjectHashOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs Map Vectorization: @@ -604,13 +604,13 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat