[01/50] [abbrv] hive git commit: HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) [Forced Update!]

omalley Tue, 28 Mar 2017 15:31:44 -0700

Repository: hive
Updated Branches:
  refs/heads/branch-2.2 ab3af5b02 -> f31f7495e (forced update)



HIVE-16023: Wrong estimation for number of rows generated by IN expression 
(Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)

Change-Id: I81ab162ba9df684d1c0c8b9f195779144b3d465a


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ceea54fd
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ceea54fd
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ceea54fd

Branch: refs/heads/branch-2.2
Commit: ceea54fdf9bf646f1f782071f042ab24350e2540
Parents: 9e7daaf
Author: Jesus Camacho Rodriguez <[email protected]>
Authored: Mon Feb 27 15:22:23 2017 +0000
Committer: Owen O'Malley <[email protected]>
Committed: Tue Mar 28 15:27:49 2017 -0700

----------------------------------------------------------------------
 .../stats/annotation/StatsRulesProcFactory.java | 46 +++++++++++++-------
 .../clientpositive/remove_exprs_stats.q.out     | 18 ++++----
 2 files changed, 39 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/ceea54fd/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 1000dc9..faf26c5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -287,8 +287,13 @@ public class StatsRulesProcFactory {
       long newNumRows = 0;
       Statistics andStats = null;
 
-      if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0)
+      if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) {
+        if (isDebugEnabled) {
+          LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() +
+              " Original data size: " + stats.getDataSize() + " New num rows: 1");
+        }
         return 1;
+      }
 
       if (pred instanceof ExprNodeGenericFuncDesc) {
         ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
@@ -345,23 +350,31 @@ public class StatsRulesProcFactory {
         if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
           ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
           if (cs != null) {
-            return cs.getNumTrues();
+            newNumRows = cs.getNumTrues();
+          } else {
+            // default
+            newNumRows = stats.getNumRows() / 2;
           }
+        } else {
+          // if not boolean column return half the number of rows
+          newNumRows = stats.getNumRows() / 2;
         }
-
-        // if not boolean column return half the number of rows
-        return stats.getNumRows() / 2;
       } else if (pred instanceof ExprNodeConstantDesc) {
 
         // special case for handling false constants
         ExprNodeConstantDesc encd = (ExprNodeConstantDesc) pred;
         if (Boolean.FALSE.equals(encd.getValue())) {
-          return 0;
+          newNumRows = 0;
         } else {
-          return stats.getNumRows();
+          newNumRows = stats.getNumRows();
         }
       }
 
+      if (isDebugEnabled) {
+        LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() +
+            " New num rows: " + newNumRows);
+      }
+
       return newNumRows;
     }
 
@@ -443,15 +456,16 @@ public class StatsRulesProcFactory {
       }
 
       // 3. Calculate IN selectivity
-      float factor = 1;
+      double factor = 1d;
       for (int i = 0; i < columnStats.size(); i++) {
         long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint();
-        // ( num of distinct vals for col / num of rows ) * num of distinct vals for col in IN clause
-        float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * values.get(i).size();
-        factor *= columnFactor;
+        // (num of distinct vals for col in IN clause  / num of distinct vals for col )
+        double columnFactor = dvs == 0 ? 0.5d : ((double) values.get(i).size() / dvs);
+        // max can be 1, even when ndv is larger in IN clause than in column stats
+        factor *= columnFactor > 1d ? 1d : columnFactor;
       }
       float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
-      return Math.round( (double)numRows * factor * inFactor);
+      return Math.round( (double) numRows * factor * inFactor);
     }
 
     private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx,
@@ -1790,11 +1804,11 @@ public class StatsRulesProcFactory {
         Map<Integer, Long> rowCountParents) {
 
       if (newNumRows < 0) {
-        LOG.info("STATS-" + jop.toString() + ": Overflow in number of rows."
+        LOG.debug("STATS-" + jop.toString() + ": Overflow in number of rows. "
           + newNumRows + " rows will be set to Long.MAX_VALUE");
       }
       if (newNumRows == 0) {
-        LOG.info("STATS-" + jop.toString() + ": Equals 0 in number of rows."
+        LOG.debug("STATS-" + jop.toString() + ": Equals 0 in number of rows. "
             + newNumRows + " rows will be set to 1");
         newNumRows = 1;
       }
@@ -2214,12 +2228,12 @@ public class StatsRulesProcFactory {
       boolean updateNDV) {
 
     if (newNumRows < 0) {
-      LOG.info("STATS-" + op.toString() + ": Overflow in number of rows."
+      LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. "
           + newNumRows + " rows will be set to Long.MAX_VALUE");
       newNumRows = StatsUtils.getMaxIfOverflow(newNumRows);
     }
     if (newNumRows == 0) {
-      LOG.info("STATS-" + op.toString() + ": Equals 0 in number of rows."
+      LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. "
           + newNumRows + " rows will be set to 1");
       newNumRows = 1;
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/ceea54fd/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
index 15e2e29..b1ba942 100644
--- a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
+++ b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
@@ -449,14 +449,14 @@ STAGE PLANS:
             Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
             Filter Operator
               predicate: (locid) IN (5) (type: boolean)
-              Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -485,14 +485,14 @@ STAGE PLANS:
             Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
             Filter Operator
               predicate: (locid) IN (5, 2, 3) (type: boolean)
-              Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -521,14 +521,14 @@ STAGE PLANS:
             Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
             Filter Operator
               predicate: (locid) IN (1, 6) (type: boolean)
-              Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat

[01/50] [abbrv] hive git commit: HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) [Forced Update!]

Reply via email to