hive git commit: HIVE-19460: Improve stats estimations for NOT IN operator (Zoltan Haindrich reviewed by Ashutosh Chauhan)

kgyrtkirk Tue, 29 May 2018 03:49:40 -0700

Repository: hive
Updated Branches:
  refs/heads/branch-3 32888e82c -> 603b0f64c



HIVE-19460: Improve stats estimations for NOT IN operator (Zoltan Haindrich 
reviewed by Ashutosh Chauhan)

Signed-off-by: Zoltan Haindrich <k...@rxd.hu>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/603b0f64
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/603b0f64
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/603b0f64

Branch: refs/heads/branch-3
Commit: 603b0f64cb95dc381a9fa050c9d25ba4d709166d
Parents: 32888e8
Author: Zoltan Haindrich <k...@rxd.hu>
Authored: Tue May 29 12:48:53 2018 +0200
Committer: Zoltan Haindrich <k...@rxd.hu>
Committed: Tue May 29 12:48:53 2018 +0200

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |   2 +
 .../stats/annotation/StatsRulesProcFactory.java | 158 ++++++++++++++++++-
 .../hadoop/hive/ql/plan/ColStatistics.java      |   1 -
 .../ql/plan/mapping/TestStatEstimations.java    | 113 +++++++++++++
 .../clientpositive/llap/acid_no_buckets.q.out   |  20 +--
 .../clientpositive/llap/explainuser_2.q.out     |  26 +--
 .../clientpositive/llap/vector_between_in.q.out |  14 +-
 .../clientpositive/llap/vector_struct_in.q.out  |   6 +-
 .../clientpositive/llap/vectorization_0.q.out   |  16 +-
 9 files changed, 312 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 72336ab..66c2831 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2373,6 +2373,8 @@ public class HiveConf extends Configuration {
         "in the number of rows filtered by a certain operator, which in turn 
might lead to overprovision or\n" +
         "underprovision of resources. This factor is applied to the 
cardinality estimation of IN clauses in\n" +
         "filter operators."),
+    HIVE_STATS_IN_MIN_RATIO("hive.stats.filter.in.min.ratio", (float) 0.05,
+        "Output estimation of an IN filter can't be lower than this ratio"),
     // Concurrency
     HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false,
         "Whether Hive supports concurrency control or not. \n" +

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 91cccfb..d0be33b 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -19,8 +19,8 @@
 package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
 
 import java.lang.reflect.Field;
-import java.util.Arrays;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -30,7 +30,6 @@ import java.util.Map.Entry;
 import java.util.Optional;
 import java.util.Set;
 import java.util.Stack;
-
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.ql.Context;
@@ -60,6 +59,7 @@ import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.AggregationDesc;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
@@ -494,6 +494,19 @@ public class StatsRulesProcFactory {
         }
       }
 
+      boolean allColsFilteredByStats = true;
+      for (int i = 0; i < columnStats.size(); i++) {
+        ValuePruner vp = new ValuePruner(columnStats.get(i));
+        allColsFilteredByStats &= vp.isValid();
+        Set<ExprNodeDescEqualityWrapper> newValues = Sets.newHashSet();
+        for (ExprNodeDescEqualityWrapper v : values.get(i)) {
+          if (vp.accept(v)) {
+            newValues.add(v);
+          }
+        }
+        values.set(i, newValues);
+      }
+
       // 3. Calculate IN selectivity
       double factor = 1d;
       for (int i = 0; i < columnStats.size(); i++) {
@@ -503,10 +516,151 @@ public class StatsRulesProcFactory {
         // max can be 1, even when ndv is larger in IN clause than in column 
stats
         factor *= columnFactor > 1d ? 1d : columnFactor;
       }
+      if (!allColsFilteredByStats) {
+        factor = Double.max(factor, HiveConf.getFloatVar(aspCtx.getConf(), 
HiveConf.ConfVars.HIVE_STATS_IN_MIN_RATIO));
+      }
       float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), 
HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
       return Math.round( numRows * factor * inFactor);
     }
 
+    static class RangeOps {
+
+      private String colType;
+      private Range range;
+
+      public RangeOps(String colType, Range range) {
+        this.colType = colType;
+        this.range = range;
+      }
+
+      public static RangeOps build(String colType, Range range) {
+        if (range == null || range.minValue == null || range.maxValue == null) 
{
+          return null;
+        }
+        return new RangeOps(colType, range);
+      }
+
+      enum RangeResult {
+        BELOW, AT_MIN, BETWEEN, AT_MAX, ABOVE;
+
+        public static RangeResult of(boolean ltMin, boolean ltMax, boolean 
eqMin, boolean eqMax) {
+          if (ltMin) {
+            return RangeResult.BELOW;
+          }
+          if (eqMin) {
+            return RangeResult.AT_MIN;
+          }
+          if (ltMax) {
+            return RangeResult.BETWEEN;
+          }
+          if (eqMax) {
+            return AT_MAX;
+          }
+          return ABOVE;
+        }
+      }
+
+      public boolean contains(ExprNodeDesc exprNode) {
+        RangeResult intersection = intersect(exprNode);
+        return intersection != RangeResult.ABOVE && intersection != 
RangeResult.BELOW;
+      }
+
+      public RangeResult intersect(ExprNodeDesc exprNode) {
+        if (!(exprNode instanceof ExprNodeConstantDesc)) {
+          return null;
+        }
+        try {
+
+          ExprNodeConstantDesc constantDesc = (ExprNodeConstantDesc) exprNode;
+
+          String stringVal = constantDesc.getValue().toString();
+
+          @Deprecated
+          String boundValue = stringVal;
+          switch (colType) {
+          case serdeConstants.TINYINT_TYPE_NAME: {
+            byte value = new Byte(stringVal);
+            byte maxValue = range.maxValue.byteValue();
+            byte minValue = range.minValue.byteValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          case serdeConstants.SMALLINT_TYPE_NAME: {
+            short value = new Short(boundValue);
+            short maxValue = range.maxValue.shortValue();
+            short minValue = range.minValue.shortValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          case serdeConstants.DATE_TYPE_NAME: {
+            DateWritable dateWriteable = new 
DateWritable(java.sql.Date.valueOf(boundValue));
+            int value = dateWriteable.getDays();
+            int maxValue = range.maxValue.intValue();
+            int minValue = range.minValue.intValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          case serdeConstants.INT_TYPE_NAME: {
+            int value = new Integer(boundValue);
+            int maxValue = range.maxValue.intValue();
+            int minValue = range.minValue.intValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          case serdeConstants.BIGINT_TYPE_NAME: {
+            long value = new Long(boundValue);
+            long maxValue = range.maxValue.longValue();
+            long minValue = range.minValue.longValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          case serdeConstants.FLOAT_TYPE_NAME: {
+            float value = new Float(boundValue);
+            float maxValue = range.maxValue.floatValue();
+            float minValue = range.minValue.floatValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          case serdeConstants.DOUBLE_TYPE_NAME: {
+            double value = new Double(boundValue);
+            double maxValue = range.maxValue.doubleValue();
+            double minValue = range.minValue.doubleValue();
+            return RangeResult.of(value < minValue, value < maxValue, value == 
minValue, value == maxValue);
+          }
+          default:
+            return null;
+          }
+        } catch (Exception e) {
+          // NumberFormatException value out of range
+          // other unknown cases
+          return null;
+        }
+      }
+
+    }
+
+    private static class ValuePruner {
+
+      private boolean valid;
+      private RangeOps colRange;
+
+      ValuePruner(ColStatistics colStatistics) {
+        if (colStatistics == null) {
+          valid = false;
+          return;
+        }
+        colRange = RangeOps.build(colStatistics.getColumnType(), 
colStatistics.getRange());
+        if (colRange == null) {
+          valid = false;
+          return;
+        }
+        valid = true;
+      }
+
+      public boolean isValid() {
+        return valid;
+      }
+
+      public boolean accept(ExprNodeDescEqualityWrapper e) {
+        /** removes all values which are outside of the scope of the column */
+        return !valid || colRange.contains(e.getExprNodeDesc());
+      }
+    }
+
     private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, long 
currNumRows, AnnotateStatsProcCtx aspCtx,
             List<String> neededCols, Operator<?> op) throws SemanticException {
       final ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) pred;

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java 
b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
index 106e59f..a31f965 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
@@ -196,5 +196,4 @@ public class ColStatistics {
       return sb.toString();
     }
   }
-
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java
----------------------------------------------------------------------
diff --git 
a/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java 
b/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java
new file mode 100644
index 0000000..e5233ce
--- /dev/null
+++ 
b/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.plan.mapping;
+
+import static org.junit.Assert.assertEquals;
+import java.util.List;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.DriverFactory;
+import org.apache.hadoop.hive.ql.IDriver;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.parse.ParseException;
+import org.apache.hadoop.hive.ql.plan.mapper.PlanMapper;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hive.testutils.HiveTestEnvSetup;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestRule;
+
+public class TestStatEstimations {
+
+  @ClassRule
+  public static HiveTestEnvSetup env_setup = new HiveTestEnvSetup();
+
+  @Rule
+  public TestRule methodRule = env_setup.getMethodRule();
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    IDriver driver = createDriver();
+    dropTables(driver);
+    String cmds[] = {
+        // @formatter:off
+        "create table t2(a integer, b string) STORED AS ORC",
+        "insert into t2 values(1, 'AAA'),(2, 'AAA'),(3, 'AAA'),(4, 'AAA'),(5, 
'AAA')," +
+                              "(6, 'BBB'),(7, 'BBB'),(8, 'BBB'),(9, 
'BBB'),(10, 'BBB')",
+        "analyze table t2 compute statistics for columns"
+        // @formatter:on
+    };
+    for (String cmd : cmds) {
+      int ret = driver.run(cmd).getResponseCode();
+      assertEquals("Checking command success", 0, ret);
+    }
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    IDriver driver = createDriver();
+    dropTables(driver);
+  }
+
+  public static void dropTables(IDriver driver) throws Exception {
+    String tables[] = {"t2" };
+    for (String t : tables) {
+      int ret = driver.run("drop table if exists " + t).getResponseCode();
+      assertEquals("Checking command success", 0, ret);
+    }
+  }
+
+  private PlanMapper getMapperForQuery(IDriver driver, String query) {
+    int ret = driver.run(query).getResponseCode();
+    assertEquals("Checking command success", 0, ret);
+    PlanMapper pm0 = driver.getContext().getPlanMapper();
+    return pm0;
+  }
+
+  @Test
+  public void testFilterIntIn() throws ParseException {
+    IDriver driver = createDriver();
+    String query = "explain select a from t2 where a IN (-1,0,1,2,10,20,30,40) 
order by a";
+
+    PlanMapper pm = getMapperForQuery(driver, query);
+    List<FilterOperator> fos = pm.getAll(FilterOperator.class);
+    // the same operator is present 2 times
+    fos.sort(TestCounterMapping.OPERATOR_ID_COMPARATOR.reversed());
+    assertEquals(1, fos.size());
+    FilterOperator fop = fos.get(0);
+
+    // all outside elements should be ignored from stat estimation
+    assertEquals(3, fop.getStatistics().getNumRows());
+
+  }
+
+  private static IDriver createDriver() {
+    HiveConf conf = env_setup.getTestCtx().hiveConf;
+
+    conf.setBoolVar(ConfVars.HIVE_VECTORIZATION_ENABLED, false);
+    conf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER,
+        
"org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
+    SessionState.start(conf);
+
+    IDriver driver = DriverFactory.newDriver(conf);
+    return driver;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out 
b/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
index f03e64b..eb4a8cb 100644
--- a/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
+++ b/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
@@ -304,15 +304,15 @@ STAGE PLANS:
                   Statistics: Num rows: 2015 Data size: 916825 Basic stats: 
COMPLETE Column stats: PARTIAL
                   Filter Operator
                     predicate: (key) IN ('1001', '213', '43') (type: boolean)
-                    Statistics: Num rows: 20 Data size: 9100 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Statistics: Num rows: 101 Data size: 45955 Basic stats: 
COMPLETE Column stats: PARTIAL
                     Select Operator
                       expressions: ROW__ID (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>), ds (type: string), hr (type: 
string)
                       outputColumnNames: _col0, _col1, _col2
-                      Statistics: Num rows: 20 Data size: 8880 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Statistics: Num rows: 101 Data size: 44844 Basic stats: 
COMPLETE Column stats: PARTIAL
                       Reduce Output Operator
                         key expressions: _col0 (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>)
                         sort order: +
-                        Statistics: Num rows: 20 Data size: 8880 Basic stats: 
COMPLETE Column stats: PARTIAL
+                        Statistics: Num rows: 101 Data size: 44844 Basic 
stats: COMPLETE Column stats: PARTIAL
                         value expressions: _col1 (type: string), _col2 (type: 
string)
             Execution mode: llap
             LLAP IO: may be used (ACID table)
@@ -322,10 +322,10 @@ STAGE PLANS:
               Select Operator
                 expressions: KEY.reducesinkkey0 (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), 
VALUE._col1 (type: string)
                 outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE 
Column stats: PARTIAL
+                Statistics: Num rows: 101 Data size: 44844 Basic stats: 
COMPLETE Column stats: PARTIAL
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 20 Data size: 8880 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Statistics: Num rows: 101 Data size: 44844 Basic stats: 
COMPLETE Column stats: PARTIAL
                   table:
                       input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
@@ -804,16 +804,16 @@ STAGE PLANS:
                   Statistics: Num rows: 2015 Data size: 916825 Basic stats: 
COMPLETE Column stats: PARTIAL
                   Filter Operator
                     predicate: (key) IN ('1001', '213', '43') (type: boolean)
-                    Statistics: Num rows: 20 Data size: 9100 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Statistics: Num rows: 101 Data size: 45955 Basic stats: 
COMPLETE Column stats: PARTIAL
                     Select Operator
                       expressions: ROW__ID (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>), ds (type: string), hr (type: 
string)
                       outputColumnNames: _col0, _col1, _col2
-                      Statistics: Num rows: 20 Data size: 8880 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Statistics: Num rows: 101 Data size: 44844 Basic stats: 
COMPLETE Column stats: PARTIAL
                       Reduce Output Operator
                         key expressions: _col0 (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>)
                         sort order: +
                         Map-reduce partition columns: UDFToInteger(_col0) 
(type: int)
-                        Statistics: Num rows: 20 Data size: 8880 Basic stats: 
COMPLETE Column stats: PARTIAL
+                        Statistics: Num rows: 101 Data size: 44844 Basic 
stats: COMPLETE Column stats: PARTIAL
                         value expressions: _col1 (type: string), _col2 (type: 
string)
             Execution mode: llap
             LLAP IO: may be used (ACID table)
@@ -823,10 +823,10 @@ STAGE PLANS:
               Select Operator
                 expressions: KEY.reducesinkkey0 (type: 
struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), 
VALUE._col1 (type: string)
                 outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE 
Column stats: PARTIAL
+                Statistics: Num rows: 101 Data size: 44844 Basic stats: 
COMPLETE Column stats: PARTIAL
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 20 Data size: 8880 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Statistics: Num rows: 101 Data size: 44844 Basic stats: 
COMPLETE Column stats: PARTIAL
                   table:
                       input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out 
b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
index 3930a14..361dc07 100644
--- a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
+++ b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
@@ -304,39 +304,39 @@ Stage-0
     Stage-1
       Reducer 5 vectorized, llap
       File Output Operator [FS_126]
-        Limit [LIM_125] (rows=5 width=285)
+        Limit [LIM_125] (rows=12 width=285)
           Number of rows:100
-          Select Operator [SEL_124] (rows=5 width=285)
+          Select Operator [SEL_124] (rows=12 width=285)
             Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
           <-Reducer 4 [SIMPLE_EDGE] vectorized, llap
             SHUFFLE [RS_123]
-              Group By Operator [GBY_122] (rows=5 width=285)
+              Group By Operator [GBY_122] (rows=12 width=285)
                 
Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"],keys:KEY._col0,
 KEY._col1, KEY._col2
               <-Reducer 3 [SIMPLE_EDGE] llap
                 SHUFFLE [RS_49]
                   PartitionCols:_col0, _col1, _col2
-                  Group By Operator [GBY_48] (rows=5 width=285)
+                  Group By Operator [GBY_48] (rows=12 width=285)
                     
Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(_col11)","count(_col21)","count(_col3)"],keys:_col10,
 _col20, _col2
-                    Merge Join Operator [MERGEJOIN_97] (rows=4704 width=534)
+                    Merge Join Operator [MERGEJOIN_97] (rows=9275 width=534)
                       Conds:RS_44._col1, _col3=RS_45._col15, 
_col17(Inner),Output:["_col2","_col3","_col10","_col11","_col20","_col21"]
                     <-Reducer 10 [SIMPLE_EDGE] llap
                       SHUFFLE [RS_45]
                         PartitionCols:_col15, _col17
-                        Select Operator [SEL_40] (rows=336 width=447)
+                        Select Operator [SEL_40] (rows=420 width=447)
                           Output:["_col4","_col5","_col14","_col15","_col17"]
-                          Merge Join Operator [MERGEJOIN_96] (rows=336 
width=447)
+                          Merge Join Operator [MERGEJOIN_96] (rows=420 
width=447)
                             Conds:RS_37._col4, _col2=RS_38._col4, 
_col2(Inner),Output:["_col0","_col1","_col14","_col15","_col17"]
                           <-Reducer 11 [SIMPLE_EDGE] llap
                             SHUFFLE [RS_38]
                               PartitionCols:_col4, _col2
-                              Merge Join Operator [MERGEJOIN_95] (rows=8 
width=356)
+                              Merge Join Operator [MERGEJOIN_95] (rows=10 
width=356)
                                 
Conds:RS_121._col0=RS_109._col0(Inner),Output:["_col2","_col3","_col4","_col5"]
                               <-Map 6 [SIMPLE_EDGE] vectorized, llap
                                 SHUFFLE [RS_109]
                                   PartitionCols:_col0
-                                  Select Operator [SEL_106] (rows=5 width=178)
+                                  Select Operator [SEL_106] (rows=25 width=178)
                                     Output:["_col0"]
-                                    Filter Operator [FIL_103] (rows=5 
width=178)
+                                    Filter Operator [FIL_103] (rows=25 
width=178)
                                       predicate:((value) IN ('2000Q1', 
'2000Q2', '2000Q3') and key is not null)
                                       TableScan [TS_3] (rows=500 width=178)
                                         
default@src,d3,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
@@ -402,14 +402,14 @@ Stage-0
                     <-Reducer 2 [SIMPLE_EDGE] llap
                       SHUFFLE [RS_44]
                         PartitionCols:_col1, _col3
-                        Merge Join Operator [MERGEJOIN_91] (rows=70 width=269)
+                        Merge Join Operator [MERGEJOIN_91] (rows=265 width=269)
                           
Conds:RS_100._col0=RS_107._col0(Inner),Output:["_col1","_col2","_col3"]
                         <-Map 6 [SIMPLE_EDGE] vectorized, llap
                           SHUFFLE [RS_107]
                             PartitionCols:_col0
-                            Select Operator [SEL_104] (rows=5 width=178)
+                            Select Operator [SEL_104] (rows=25 width=178)
                               Output:["_col0"]
-                              Filter Operator [FIL_101] (rows=5 width=178)
+                              Filter Operator [FIL_101] (rows=25 width=178)
                                 predicate:((value) IN ('2000Q1', '2000Q2', 
'2000Q3') and key is not null)
                                  Please refer to the previous TableScan [TS_3]
                         <-Map 1 [SIMPLE_EDGE] vectorized, llap

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out 
b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
index f76053e..b1c0bab 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
@@ -57,7 +57,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: FilterLongColumnInList(col 
3:date, values [-67, -171])
                     predicate: (cdate) IN (DATE'1969-10-26', DATE'1969-07-14') 
(type: boolean)
-                    Statistics: Num rows: 10 Data size: 532 Basic stats: 
COMPLETE Column stats: NONE
+                    Statistics: Num rows: 1 Data size: 53 Basic stats: 
COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cdate (type: date)
                       outputColumnNames: _col0
@@ -65,7 +65,7 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [3]
-                      Statistics: Num rows: 10 Data size: 532 Basic stats: 
COMPLETE Column stats: NONE
+                      Statistics: Num rows: 1 Data size: 53 Basic stats: 
COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: date)
                         sort order: +
@@ -73,7 +73,7 @@ STAGE PLANS:
                             className: VectorReduceSinkObjectHashOperator
                             native: true
                             nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine 
tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, 
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        Statistics: Num rows: 10 Data size: 532 Basic stats: 
COMPLETE Column stats: NONE
+                        Statistics: Num rows: 1 Data size: 53 Basic stats: 
COMPLETE Column stats: NONE
             Execution mode: vectorized, llap
             LLAP IO: all inputs
             Map Vectorization:
@@ -101,13 +101,13 @@ STAGE PLANS:
                     className: VectorSelectOperator
                     native: true
                     projectedOutputColumnNums: [0]
-                Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE 
Column stats: NONE
+                Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE 
Column stats: NONE
                 File Output Operator
                   compressed: false
                   File Sink Vectorization:
                       className: VectorFileSinkOperator
                       native: false
-                  Statistics: Num rows: 10 Data size: 532 Basic stats: 
COMPLETE Column stats: NONE
+                  Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE 
Column stats: NONE
                   table:
                       input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -152,13 +152,13 @@ STAGE PLANS:
                         native: true
                         predicateExpression: SelectColumnIsFalse(col 
5:boolean)(children: LongColumnInList(col 3, values [-67, -171, 20]) -> 
5:boolean)
                     predicate: (not (cdate) IN (DATE'1969-10-26', 
DATE'1969-07-14', DATE'1970-01-21')) (type: boolean)
-                    Statistics: Num rows: 12274 Data size: 653057 Basic stats: 
COMPLETE Column stats: NONE
+                    Statistics: Num rows: 12284 Data size: 653589 Basic stats: 
COMPLETE Column stats: NONE
                     Select Operator
                       Select Vectorization:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: []
-                      Statistics: Num rows: 12274 Data size: 653057 Basic 
stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 12284 Data size: 653589 Basic 
stats: COMPLETE Column stats: NONE
                       Group By Operator
                         aggregations: count()
                         Group By Vectorization:

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out 
b/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
index 5afa99d..f210b72 100644
--- a/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
@@ -847,7 +847,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: 
FilterStructColumnInList(structExpressions [col 0:bigint, col 1:string, col 
2:double], fieldVectorColumnTypes [LONG, BYTES, DOUBLE], structColumnMap [0, 1, 
2])
                     predicate: (struct(my_bigint,my_string,my_double)) IN 
(const struct(1L,'a',1.5D), const struct(1L,'b',-0.5D), const 
struct(3L,'b',1.5D), const struct(1L,'d',1.5D), const struct(1L,'c',1.5D), 
const struct(1L,'b',2.5D), const struct(1L,'b',0.5D), const 
struct(5L,'b',1.5D), const struct(1L,'a',0.5D), const struct(3L,'b',1.5D)) 
(type: boolean)
-                    Statistics: Num rows: 3 Data size: 303 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 2 Data size: 202 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: my_bigint (type: bigint), my_string (type: 
string), my_double (type: double)
                       outputColumnNames: _col0, _col1, _col2
@@ -855,13 +855,13 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [0, 1, 2]
-                      Statistics: Num rows: 3 Data size: 303 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 2 Data size: 202 Basic stats: 
COMPLETE Column stats: COMPLETE
                       File Output Operator
                         compressed: false
                         File Sink Vectorization:
                             className: VectorFileSinkOperator
                             native: false
-                        Statistics: Num rows: 3 Data size: 303 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Statistics: Num rows: 2 Data size: 202 Basic stats: 
COMPLETE Column stats: COMPLETE
                         table:
                             input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                             output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat

http://git-wip-us.apache.org/repos/asf/hive/blob/603b0f64/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out 
b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
index c3d810e..3d00bbe 100644
--- a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
@@ -30975,19 +30975,19 @@ STAGE PLANS:
                   Filter Operator
                     isSamplingPred: false
                     predicate: (cstring1) IN ('biology', 'history', 
'topology') (type: boolean)
-                    Statistics: Num rows: 6 Data size: 470 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 614 Data size: 43146 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Group By Operator
                       aggregations: count()
                       keys: cstring1 (type: string)
                       mode: hash
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 3 Data size: 306 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 304 Data size: 23864 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
                         key expressions: _col0 (type: string)
                         null sort order: a
                         sort order: +
                         Map-reduce partition columns: _col0 (type: string)
-                        Statistics: Num rows: 3 Data size: 306 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Statistics: Num rows: 304 Data size: 23864 Basic 
stats: COMPLETE Column stats: COMPLETE
                         tag: -1
                         value expressions: _col1 (type: bigint)
                         auto parallelism: true
@@ -31055,16 +31055,16 @@ STAGE PLANS:
                 keys: KEY._col0 (type: string)
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Statistics: Num rows: 304 Data size: 23864 Basic stats: 
COMPLETE Column stats: COMPLETE
                 Select Operator
                   expressions: _col1 (type: bigint), _col0 (type: string)
                   outputColumnNames: _col0, _col1
-                  Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Statistics: Num rows: 304 Data size: 23864 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Reduce Output Operator
                     key expressions: _col1 (type: string)
                     null sort order: a
                     sort order: +
-                    Statistics: Num rows: 3 Data size: 306 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 304 Data size: 23864 Basic stats: 
COMPLETE Column stats: COMPLETE
                     tag: -1
                     value expressions: _col0 (type: bigint)
                     auto parallelism: false
@@ -31075,13 +31075,13 @@ STAGE PLANS:
               Select Operator
                 expressions: VALUE._col0 (type: bigint), KEY.reducesinkkey0 
(type: string)
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Statistics: Num rows: 304 Data size: 23864 Basic stats: 
COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
                   GlobalTableId: 0
 #### A masked pattern was here ####
                   NumFilesPerFileSink: 1
-                  Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Statistics: Num rows: 304 Data size: 23864 Basic stats: 
COMPLETE Column stats: COMPLETE
 #### A masked pattern was here ####
                   table:
                       input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat

hive git commit: HIVE-19460: Improve stats estimations for NOT IN operator (Zoltan Haindrich reviewed by Ashutosh Chauhan)

Reply via email to