Repository: hive
Updated Branches:
  refs/heads/branch-2.2 ab3af5b02 -> f31f7495e (forced update)
HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) Change-Id: I81ab162ba9df684d1c0c8b9f195779144b3d465a Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ceea54fd Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ceea54fd Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ceea54fd Branch: refs/heads/branch-2.2 Commit: ceea54fdf9bf646f1f782071f042ab24350e2540 Parents: 9e7daaf Author: Jesus Camacho Rodriguez <[email protected]> Authored: Mon Feb 27 15:22:23 2017 +0000 Committer: Owen O'Malley <[email protected]> Committed: Tue Mar 28 15:27:49 2017 -0700 ---------------------------------------------------------------------- .../stats/annotation/StatsRulesProcFactory.java | 46 +++++++++++++------- .../clientpositive/remove_exprs_stats.q.out | 18 ++++---- 2 files changed, 39 insertions(+), 25 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/ceea54fd/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 1000dc9..faf26c5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -287,8 +287,13 @@ public class StatsRulesProcFactory { long newNumRows = 0; Statistics andStats = null; - if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) + if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) { + if (isDebugEnabled) { + LOG.debug("Estimating row count for " + pred + " Original num 
rows: " + stats.getNumRows() + + " Original data size: " + stats.getDataSize() + " New num rows: 1"); + } return 1; + } if (pred instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred; @@ -345,23 +350,31 @@ public class StatsRulesProcFactory { if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) { ColStatistics cs = stats.getColumnStatisticsFromColName(colName); if (cs != null) { - return cs.getNumTrues(); + newNumRows = cs.getNumTrues(); + } else { + // default + newNumRows = stats.getNumRows() / 2; } + } else { + // if not boolean column return half the number of rows + newNumRows = stats.getNumRows() / 2; } - - // if not boolean column return half the number of rows - return stats.getNumRows() / 2; } else if (pred instanceof ExprNodeConstantDesc) { // special case for handling false constants ExprNodeConstantDesc encd = (ExprNodeConstantDesc) pred; if (Boolean.FALSE.equals(encd.getValue())) { - return 0; + newNumRows = 0; } else { - return stats.getNumRows(); + newNumRows = stats.getNumRows(); } } + if (isDebugEnabled) { + LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() + + " New num rows: " + newNumRows); + } + return newNumRows; } @@ -443,15 +456,16 @@ public class StatsRulesProcFactory { } // 3. Calculate IN selectivity - float factor = 1; + double factor = 1d; for (int i = 0; i < columnStats.size(); i++) { long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint(); - // ( num of distinct vals for col / num of rows ) * num of distinct vals for col in IN clause - float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * values.get(i).size(); - factor *= columnFactor; + // (num of distinct vals for col in IN clause / num of distinct vals for col ) + double columnFactor = dvs == 0 ? 0.5d : ((double) values.get(i).size() / dvs); + // max can be 1, even when ndv is larger in IN clause than in column stats + factor *= columnFactor > 1d ? 
1d : columnFactor; } float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR); - return Math.round( (double)numRows * factor * inFactor); + return Math.round( (double) numRows * factor * inFactor); } private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx, @@ -1790,11 +1804,11 @@ public class StatsRulesProcFactory { Map<Integer, Long> rowCountParents) { if (newNumRows < 0) { - LOG.info("STATS-" + jop.toString() + ": Overflow in number of rows." + LOG.debug("STATS-" + jop.toString() + ": Overflow in number of rows. " + newNumRows + " rows will be set to Long.MAX_VALUE"); } if (newNumRows == 0) { - LOG.info("STATS-" + jop.toString() + ": Equals 0 in number of rows." + LOG.debug("STATS-" + jop.toString() + ": Equals 0 in number of rows. " + newNumRows + " rows will be set to 1"); newNumRows = 1; } @@ -2214,12 +2228,12 @@ public class StatsRulesProcFactory { boolean updateNDV) { if (newNumRows < 0) { - LOG.info("STATS-" + op.toString() + ": Overflow in number of rows." + LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. " + newNumRows + " rows will be set to Long.MAX_VALUE"); newNumRows = StatsUtils.getMaxIfOverflow(newNumRows); } if (newNumRows == 0) { - LOG.info("STATS-" + op.toString() + ": Equals 0 in number of rows." + LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. 
" + newNumRows + " rows will be set to 1"); newNumRows = 1; } http://git-wip-us.apache.org/repos/asf/hive/blob/ceea54fd/ql/src/test/results/clientpositive/remove_exprs_stats.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out index 15e2e29..b1ba942 100644 --- a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out +++ b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out @@ -449,14 +449,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid) IN (5) (type: boolean) - Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -485,14 +485,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid) IN (5, 2, 3) (type: boolean) - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: 
int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 498 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -521,14 +521,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid) IN (1, 6) (type: boolean) - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
