Repository: hive
Updated Branches:
  refs/heads/master b382c5082 -> f65739523
HIVE-14018: Make IN clause row selectivity estimation customizable (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f6573952
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f6573952
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f6573952

Branch: refs/heads/master
Commit: f65739523338f23c2d960ceda59d660ef78dd315
Parents: b382c50
Author: Jesus Camacho Rodriguez <[email protected]>
Authored: Fri Jun 17 12:06:19 2016 +0100
Committer: Jesus Camacho Rodriguez <[email protected]>
Committed: Fri Jun 17 12:06:19 2016 +0100

----------------------------------------------------------------------
 common/src/java/org/apache/hadoop/hive/conf/HiveConf.java | 5 +++++
 .../ql/optimizer/stats/annotation/StatsRulesProcFactory.java | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/f6573952/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index cc95008..1e0ffa4 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1624,6 +1624,11 @@ public class HiveConf extends Configuration {
         "of rows and data size. Since files in tables/partitions are serialized (and optionally\n" +
         "compressed) the estimates of number of rows and data size cannot be reliably determined.\n" +
         "This factor is multiplied with the file size to account for serialization and compression."),
+    HIVE_STATS_IN_CLAUSE_FACTOR("hive.stats.filter.in.factor", (float) 1.0,
+        "Currently column distribution is assumed to be uniform. This can lead to overestimation/underestimation\n" +
+        "in the number of rows filtered by a certain operator, which in turn might lead to overprovision or\n" +
+        "underprovision of resources. This factor is applied to the cardinality estimation of IN clauses in\n" +
+        "filter operators."),
 
     // Concurrency
     HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false,

http://git-wip-us.apache.org/repos/asf/hive/blob/f6573952/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 3f82594..5625091 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -473,7 +473,8 @@ public class StatsRulesProcFactory {
         float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * values.get(i).size();
         factor *= columnFactor;
       }
-      return Math.round( (double)numRows * factor);
+      float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
+      return Math.round( (double)numRows * factor * inFactor);
     }
 
     private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred,
