Repository: hive
Updated Branches:
  refs/heads/master b382c5082 -> f65739523


HIVE-14018: Make IN clause row selectivity estimation customizable (Jesus 
Camacho Rodriguez, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f6573952
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f6573952
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f6573952

Branch: refs/heads/master
Commit: f65739523338f23c2d960ceda59d660ef78dd315
Parents: b382c50
Author: Jesus Camacho Rodriguez <[email protected]>
Authored: Fri Jun 17 12:06:19 2016 +0100
Committer: Jesus Camacho Rodriguez <[email protected]>
Committed: Fri Jun 17 12:06:19 2016 +0100

----------------------------------------------------------------------
 common/src/java/org/apache/hadoop/hive/conf/HiveConf.java       | 5 +++++
 .../ql/optimizer/stats/annotation/StatsRulesProcFactory.java    | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/f6573952/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index cc95008..1e0ffa4 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1624,6 +1624,11 @@ public class HiveConf extends Configuration {
         "of rows and data size. Since files in tables/partitions are 
serialized (and optionally\n" +
         "compressed) the estimates of number of rows and data size cannot be 
reliably determined.\n" +
         "This factor is multiplied with the file size to account for 
serialization and compression."),
+    HIVE_STATS_IN_CLAUSE_FACTOR("hive.stats.filter.in.factor", (float) 1.0,
+        "Currently column distribution is assumed to be uniform. This can lead 
to overestimation/underestimation\n" +
+        "in the number of rows filtered by a certain operator, which in turn 
might lead to overprovision or\n" +
+        "underprovision of resources. This factor is applied to the 
cardinality estimation of IN clauses in\n" +
+        "filter operators."),
 
     // Concurrency
     HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false,

http://git-wip-us.apache.org/repos/asf/hive/blob/f6573952/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 3f82594..5625091 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -473,7 +473,8 @@ public class StatsRulesProcFactory {
         float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * 
values.get(i).size();
         factor *= columnFactor;
       }
-      return Math.round( (double)numRows * factor);
+      float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), 
HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
+      return Math.round( (double)numRows * factor * inFactor);
     }
 
     private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred,

Reply via email to