Repository: hive Updated Branches: refs/heads/branch-3 32888e82c -> 603b0f64c
HIVE-19460: Improve stats estimations for NOT IN operator (Zoltan Haindrich reviewed by Ashutosh Chauhan) Signed-off-by: Zoltan Haindrich <k...@rxd.hu> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/603b0f64 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/603b0f64 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/603b0f64 Branch: refs/heads/branch-3 Commit: 603b0f64cb95dc381a9fa050c9d25ba4d709166d Parents: 32888e8 Author: Zoltan Haindrich <k...@rxd.hu> Authored: Tue May 29 12:48:53 2018 +0200 Committer: Zoltan Haindrich <k...@rxd.hu> Committed: Tue May 29 12:48:53 2018 +0200 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 2 + .../stats/annotation/StatsRulesProcFactory.java | 158 ++++++++++++++++++- .../hadoop/hive/ql/plan/ColStatistics.java | 1 - .../ql/plan/mapping/TestStatEstimations.java | 113 +++++++++++++ .../clientpositive/llap/acid_no_buckets.q.out | 20 +-- .../clientpositive/llap/explainuser_2.q.out | 26 +-- .../clientpositive/llap/vector_between_in.q.out | 14 +- .../clientpositive/llap/vector_struct_in.q.out | 6 +- .../clientpositive/llap/vectorization_0.q.out | 16 +- 9 files changed, 312 insertions(+), 44 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 72336ab..66c2831 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2373,6 +2373,8 @@ public class HiveConf extends Configuration { "in the number of rows filtered by a certain operator, which in turn might lead to overprovision or\n" + "underprovision of resources. This factor is applied to the cardinality estimation of IN clauses in\n" + "filter operators."), + HIVE_STATS_IN_MIN_RATIO("hive.stats.filter.in.min.ratio", (float) 0.05, + "Output estimation of an IN filter can't be lower than this ratio"), // Concurrency HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false, "Whether Hive supports concurrency control or not. \n" + http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 91cccfb..d0be33b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -19,8 +19,8 @@ package org.apache.hadoop.hive.ql.optimizer.stats.annotation; import java.lang.reflect.Field; -import java.util.Arrays; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -30,7 +30,6 @@ import java.util.Map.Entry; import java.util.Optional; import java.util.Set; import java.util.Stack; - import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.Context; @@ -60,6 +59,7 @@ import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.AggregationDesc; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; @@ -494,6 +494,19 @@ public class StatsRulesProcFactory { } } + boolean allColsFilteredByStats = true; + for (int i = 0; i < columnStats.size(); i++) { + ValuePruner vp = new ValuePruner(columnStats.get(i)); + allColsFilteredByStats &= vp.isValid(); + Set<ExprNodeDescEqualityWrapper> newValues = Sets.newHashSet(); + for (ExprNodeDescEqualityWrapper v : values.get(i)) { + if (vp.accept(v)) { + newValues.add(v); + } + } + values.set(i, newValues); + } + // 3. Calculate IN selectivity double factor = 1d; for (int i = 0; i < columnStats.size(); i++) { @@ -503,10 +516,151 @@ public class StatsRulesProcFactory { // max can be 1, even when ndv is larger in IN clause than in column stats factor *= columnFactor > 1d ? 1d : columnFactor; } + if (!allColsFilteredByStats) { + factor = Double.max(factor, HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_MIN_RATIO)); + } float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR); return Math.round( numRows * factor * inFactor); } + static class RangeOps { + + private String colType; + private Range range; + + public RangeOps(String colType, Range range) { + this.colType = colType; + this.range = range; + } + + public static RangeOps build(String colType, Range range) { + if (range == null || range.minValue == null || range.maxValue == null) { + return null; + } + return new RangeOps(colType, range); + } + + enum RangeResult { + BELOW, AT_MIN, BETWEEN, AT_MAX, ABOVE; + + public static RangeResult of(boolean ltMin, boolean ltMax, boolean eqMin, boolean eqMax) { + if (ltMin) { + return RangeResult.BELOW; + } + if (eqMin) { + return RangeResult.AT_MIN; + } + if (ltMax) { + return RangeResult.BETWEEN; + } + if (eqMax) { + return AT_MAX; + } + return ABOVE; + } + } + + public boolean contains(ExprNodeDesc exprNode) { + RangeResult intersection = intersect(exprNode); + return intersection != RangeResult.ABOVE && intersection != RangeResult.BELOW; + } + + public RangeResult intersect(ExprNodeDesc exprNode) { + if (!(exprNode instanceof ExprNodeConstantDesc)) { + return null; + } + try { + + ExprNodeConstantDesc constantDesc = (ExprNodeConstantDesc) exprNode; + + String stringVal = constantDesc.getValue().toString(); + + @Deprecated + String boundValue = stringVal; + switch (colType) { + case serdeConstants.TINYINT_TYPE_NAME: { + byte value = new Byte(stringVal); + byte maxValue = range.maxValue.byteValue(); + byte minValue = range.minValue.byteValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + case serdeConstants.SMALLINT_TYPE_NAME: { + short value = new Short(boundValue); + short maxValue = range.maxValue.shortValue(); + short minValue = range.minValue.shortValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + case serdeConstants.DATE_TYPE_NAME: { + DateWritable dateWriteable = new DateWritable(java.sql.Date.valueOf(boundValue)); + int value = dateWriteable.getDays(); + int maxValue = range.maxValue.intValue(); + int minValue = range.minValue.intValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + case serdeConstants.INT_TYPE_NAME: { + int value = new Integer(boundValue); + int maxValue = range.maxValue.intValue(); + int minValue = range.minValue.intValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + case serdeConstants.BIGINT_TYPE_NAME: { + long value = new Long(boundValue); + long maxValue = range.maxValue.longValue(); + long minValue = range.minValue.longValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + case serdeConstants.FLOAT_TYPE_NAME: { + float value = new Float(boundValue); + float maxValue = range.maxValue.floatValue(); + float minValue = range.minValue.floatValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + case serdeConstants.DOUBLE_TYPE_NAME: { + double value = new Double(boundValue); + double maxValue = range.maxValue.doubleValue(); + double minValue = range.minValue.doubleValue(); + return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue); + } + default: + return null; + } + } catch (Exception e) { + // NumberFormatException value out of range + // other unknown cases + return null; + } + } + + } + + private static class ValuePruner { + + private boolean valid; + private RangeOps colRange; + + ValuePruner(ColStatistics colStatistics) { + if (colStatistics == null) { + valid = false; + return; + } + colRange = RangeOps.build(colStatistics.getColumnType(), colStatistics.getRange()); + if (colRange == null) { + valid = false; + return; + } + valid = true; + } + + public boolean isValid() { + return valid; + } + + public boolean accept(ExprNodeDescEqualityWrapper e) { + /** removes all values which are outside of the scope of the column */ + return !valid || colRange.contains(e.getExprNodeDesc()); + } + } + private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, long currNumRows, AnnotateStatsProcCtx aspCtx, List<String> neededCols, Operator<?> op) throws SemanticException { final ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) pred; http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index 106e59f..a31f965 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -196,5 +196,4 @@ public class ColStatistics { return sb.toString(); } } - } http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java b/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java new file mode 100644 index 0000000..e5233ce --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.plan.mapping; + +import static org.junit.Assert.assertEquals; +import java.util.List; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.ql.DriverFactory; +import org.apache.hadoop.hive.ql.IDriver; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.parse.ParseException; +import org.apache.hadoop.hive.ql.plan.mapper.PlanMapper; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hive.testutils.HiveTestEnvSetup; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestRule; + +public class TestStatEstimations { + + @ClassRule + public static HiveTestEnvSetup env_setup = new HiveTestEnvSetup(); + + @Rule + public TestRule methodRule = env_setup.getMethodRule(); + + @BeforeClass + public static void beforeClass() throws Exception { + IDriver driver = createDriver(); + dropTables(driver); + String cmds[] = { + // @formatter:off + "create table t2(a integer, b string) STORED AS ORC", + "insert into t2 values(1, 'AAA'),(2, 'AAA'),(3, 'AAA'),(4, 'AAA'),(5, 'AAA')," + + "(6, 'BBB'),(7, 'BBB'),(8, 'BBB'),(9, 'BBB'),(10, 'BBB')", + "analyze table t2 compute statistics for columns" + // @formatter:on + }; + for (String cmd : cmds) { + int ret = driver.run(cmd).getResponseCode(); + assertEquals("Checking command success", 0, ret); + } + } + + @AfterClass + public static void afterClass() throws Exception { + IDriver driver = createDriver(); + dropTables(driver); + } + + public static void dropTables(IDriver driver) throws Exception { + String tables[] = {"t2" }; + for (String t : tables) { + int ret = driver.run("drop table if exists " + t).getResponseCode(); + assertEquals("Checking command success", 0, ret); + } + } + + private PlanMapper getMapperForQuery(IDriver driver, String query) { + int ret = driver.run(query).getResponseCode(); + assertEquals("Checking command success", 0, ret); + PlanMapper pm0 = driver.getContext().getPlanMapper(); + return pm0; + } + + @Test + public void testFilterIntIn() throws ParseException { + IDriver driver = createDriver(); + String query = "explain select a from t2 where a IN (-1,0,1,2,10,20,30,40) order by a"; + + PlanMapper pm = getMapperForQuery(driver, query); + List<FilterOperator> fos = pm.getAll(FilterOperator.class); + // the same operator is present 2 times + fos.sort(TestCounterMapping.OPERATOR_ID_COMPARATOR.reversed()); + assertEquals(1, fos.size()); + FilterOperator fop = fos.get(0); + + // all outside elements should be ignored from stat estimation + assertEquals(3, fop.getStatistics().getNumRows()); + + } + + private static IDriver createDriver() { + HiveConf conf = env_setup.getTestCtx().hiveConf; + + conf.setBoolVar(ConfVars.HIVE_VECTORIZATION_ENABLED, false); + conf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, + "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); + SessionState.start(conf); + + IDriver driver = DriverFactory.newDriver(conf); + return driver; + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out b/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out index f03e64b..eb4a8cb 100644 --- a/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out +++ b/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out @@ -304,15 +304,15 @@ STAGE PLANS: Statistics: Num rows: 2015 Data size: 916825 Basic stats: COMPLETE Column stats: PARTIAL Filter Operator predicate: (key) IN ('1001', '213', '43') (type: boolean) - Statistics: Num rows: 20 Data size: 9100 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 45955 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), ds (type: string), hr (type: string) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL Reduce Output Operator key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>) sort order: + - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL value expressions: _col1 (type: string), _col2 (type: string) Execution mode: llap LLAP IO: may be used (ACID table) @@ -322,10 +322,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), VALUE._col1 (type: string) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator compressed: false - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL table: input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat @@ -804,16 +804,16 @@ STAGE PLANS: Statistics: Num rows: 2015 Data size: 916825 Basic stats: COMPLETE Column stats: PARTIAL Filter Operator predicate: (key) IN ('1001', '213', '43') (type: boolean) - Statistics: Num rows: 20 Data size: 9100 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 45955 Basic stats: COMPLETE Column stats: PARTIAL Select Operator expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), ds (type: string), hr (type: string) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL Reduce Output Operator key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>) sort order: + Map-reduce partition columns: UDFToInteger(_col0) (type: int) - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL value expressions: _col1 (type: string), _col2 (type: string) Execution mode: llap LLAP IO: may be used (ACID table) @@ -823,10 +823,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), VALUE._col1 (type: string) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL File Output Operator compressed: false - Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL table: input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/explainuser_2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out index 3930a14..361dc07 100644 --- a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out +++ b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out @@ -304,39 +304,39 @@ Stage-0 Stage-1 Reducer 5 vectorized, llap File Output Operator [FS_126] - Limit [LIM_125] (rows=5 width=285) + Limit [LIM_125] (rows=12 width=285) Number of rows:100 - Select Operator [SEL_124] (rows=5 width=285) + Select Operator [SEL_124] (rows=12 width=285) Output:["_col0","_col1","_col2","_col3","_col4","_col5"] <-Reducer 4 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_123] - Group By Operator [GBY_122] (rows=5 width=285) + Group By Operator [GBY_122] (rows=12 width=285) Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"],keys:KEY._col0, KEY._col1, KEY._col2 <-Reducer 3 [SIMPLE_EDGE] llap SHUFFLE [RS_49] PartitionCols:_col0, _col1, _col2 - Group By Operator [GBY_48] (rows=5 width=285) + Group By Operator [GBY_48] (rows=12 width=285) Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(_col11)","count(_col21)","count(_col3)"],keys:_col10, _col20, _col2 - Merge Join Operator [MERGEJOIN_97] (rows=4704 width=534) + Merge Join Operator [MERGEJOIN_97] (rows=9275 width=534) Conds:RS_44._col1, _col3=RS_45._col15, _col17(Inner),Output:["_col2","_col3","_col10","_col11","_col20","_col21"] <-Reducer 10 [SIMPLE_EDGE] llap SHUFFLE [RS_45] PartitionCols:_col15, _col17 - Select Operator [SEL_40] (rows=336 width=447) + Select Operator [SEL_40] (rows=420 width=447) Output:["_col4","_col5","_col14","_col15","_col17"] - Merge Join Operator [MERGEJOIN_96] (rows=336 width=447) + Merge Join Operator [MERGEJOIN_96] (rows=420 width=447) Conds:RS_37._col4, _col2=RS_38._col4, _col2(Inner),Output:["_col0","_col1","_col14","_col15","_col17"] <-Reducer 11 [SIMPLE_EDGE] llap SHUFFLE [RS_38] PartitionCols:_col4, _col2 - Merge Join Operator [MERGEJOIN_95] (rows=8 width=356) + Merge Join Operator [MERGEJOIN_95] (rows=10 width=356) Conds:RS_121._col0=RS_109._col0(Inner),Output:["_col2","_col3","_col4","_col5"] <-Map 6 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_109] PartitionCols:_col0 - Select Operator [SEL_106] (rows=5 width=178) + Select Operator [SEL_106] (rows=25 width=178) Output:["_col0"] - Filter Operator [FIL_103] (rows=5 width=178) + Filter Operator [FIL_103] (rows=25 width=178) predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null) TableScan [TS_3] (rows=500 width=178) default@src,d3,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] @@ -402,14 +402,14 @@ Stage-0 <-Reducer 2 [SIMPLE_EDGE] llap SHUFFLE [RS_44] PartitionCols:_col1, _col3 - Merge Join Operator [MERGEJOIN_91] (rows=70 width=269) + Merge Join Operator [MERGEJOIN_91] (rows=265 width=269) Conds:RS_100._col0=RS_107._col0(Inner),Output:["_col1","_col2","_col3"] <-Map 6 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_107] PartitionCols:_col0 - Select Operator [SEL_104] (rows=5 width=178) + Select Operator [SEL_104] (rows=25 width=178) Output:["_col0"] - Filter Operator [FIL_101] (rows=5 width=178) + Filter Operator [FIL_101] (rows=25 width=178) predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null) Please refer to the previous TableScan [TS_3] <-Map 1 [SIMPLE_EDGE] vectorized, llap http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/vector_between_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out index f76053e..b1c0bab 100644 --- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out @@ -57,7 +57,7 @@ STAGE PLANS: native: true predicateExpression: FilterLongColumnInList(col 3:date, values [-67, -171]) predicate: (cdate) IN (DATE'1969-10-26', DATE'1969-07-14') (type: boolean) - Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: cdate (type: date) outputColumnNames: _col0 @@ -65,7 +65,7 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [3] - Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: date) sort order: + @@ -73,7 +73,7 @@ STAGE PLANS: className: VectorReduceSinkObjectHashOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs Map Vectorization: @@ -101,13 +101,13 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [0] - Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -152,13 +152,13 @@ STAGE PLANS: native: true predicateExpression: SelectColumnIsFalse(col 5:boolean)(children: LongColumnInList(col 3, values [-67, -171, 20]) -> 5:boolean) predicate: (not (cdate) IN (DATE'1969-10-26', DATE'1969-07-14', DATE'1970-01-21')) (type: boolean) - Statistics: Num rows: 12274 Data size: 653057 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12284 Data size: 653589 Basic stats: COMPLETE Column stats: NONE Select Operator Select Vectorization: className: VectorSelectOperator native: true projectedOutputColumnNums: [] - Statistics: Num rows: 12274 Data size: 653057 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12284 Data size: 653589 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: count() Group By Vectorization: http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out b/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out index 5afa99d..f210b72 100644 --- a/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out @@ -847,7 +847,7 @@ STAGE PLANS: native: true predicateExpression: FilterStructColumnInList(structExpressions [col 0:bigint, col 1:string, col 2:double], fieldVectorColumnTypes [LONG, BYTES, DOUBLE], structColumnMap [0, 1, 2]) predicate: (struct(my_bigint,my_string,my_double)) IN (const struct(1L,'a',1.5D), const struct(1L,'b',-0.5D), const struct(3L,'b',1.5D), const struct(1L,'d',1.5D), const struct(1L,'c',1.5D), const struct(1L,'b',2.5D), const struct(1L,'b',0.5D), const struct(5L,'b',1.5D), const struct(1L,'a',0.5D), const struct(3L,'b',1.5D)) (type: boolean) - Statistics: Num rows: 3 Data size: 303 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 202 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: my_bigint (type: bigint), my_string (type: string), my_double (type: double) outputColumnNames: _col0, _col1, _col2 @@ -855,13 +855,13 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [0, 1, 2] - Statistics: Num rows: 3 Data size: 303 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 202 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - Statistics: Num rows: 3 Data size: 303 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 202 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/vectorization_0.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out index c3d810e..3d00bbe 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out @@ -30975,19 +30975,19 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (cstring1) IN ('biology', 'history', 'topology') (type: boolean) - Statistics: Num rows: 6 Data size: 470 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 614 Data size: 43146 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() keys: cstring1 (type: string) mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: a sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col1 (type: bigint) auto parallelism: true @@ -31055,16 +31055,16 @@ STAGE PLANS: keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col1 (type: bigint), _col0 (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col1 (type: string) null sort order: a sort order: + - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col0 (type: bigint) auto parallelism: false @@ -31075,13 +31075,13 @@ STAGE PLANS: Select Operator expressions: VALUE._col0 (type: bigint), KEY.reducesinkkey0 (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat