Author: gunther
Date: Mon Feb 3 22:11:50 2014
New Revision: 1564102
URL: http://svn.apache.org/r1564102
Log:
HIVE-6298: Add config flag to turn off fetching partition stats (Patch by
Gunther Hagleitner, reviewed by Sergey Shelukhin and Prasanth J)
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1564102&r1=1564101&r2=1564102&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
(original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Mon
Feb 3 22:11:50 2014
@@ -654,6 +654,9 @@ public class HiveConf extends Configurat
HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
// to accurately compute statistics for GROUPBY map side parallelism needs
to be known
HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
+ // statistics annotation fetches stats for each partition, which can be
expensive. turning
+ // this off will result in basic sizes being fetched from namenode instead
+ HIVE_STATS_FETCH_PARTITION_STATS("hive.stats.fetch.partition.stats", true),
// statistics annotation fetches column statistics for all required
columns which can
// be very expensive sometimes
HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false),
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1564102&r1=1564101&r2=1564102&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Mon
Feb 3 22:11:50 2014
@@ -110,6 +110,8 @@ public class StatsUtils {
List<String> neededColumns = tableScanOperator.getNeededColumns();
boolean fetchColStats =
HiveConf.getBoolVar(conf,
HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
+ boolean fetchPartStats =
+ HiveConf.getBoolVar(conf,
HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS);
float deserFactor =
HiveConf.getFloatVar(conf,
HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
@@ -151,27 +153,34 @@ public class StatsUtils {
} else if (partList != null) {
// For partitioned tables, get the size of all the partitions after
pruning
// the partitions that are not required
- List<Long> rowCounts = getBasicStatForPartitions(
- table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
- List<Long> dataSizes = getBasicStatForPartitions(
- table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
+ long nr = 0;
+ long ds = 0;
- long nr = getSumIgnoreNegatives(rowCounts);
- long ds = getSumIgnoreNegatives(dataSizes);
- if (ds <= 0) {
- dataSizes = getBasicStatForPartitions(
- table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
- ds = getSumIgnoreNegatives(dataSizes);
+ List<Long> rowCounts = Lists.newArrayList();
+ List<Long> dataSizes = Lists.newArrayList();
+
+ if (fetchPartStats) {
+ rowCounts = getBasicStatForPartitions(
+ table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
+ dataSizes = getBasicStatForPartitions(
+ table, partList.getNotDeniedPartns(),
StatsSetupConst.RAW_DATA_SIZE);
- // if data size still could not be determined, then fall back to
filesytem to get file
- // sizes
+ nr = getSumIgnoreNegatives(rowCounts);
+ ds = getSumIgnoreNegatives(dataSizes);
if (ds <= 0) {
- dataSizes = getFileSizeForPartitions(conf,
partList.getNotDeniedPartns());
+ dataSizes = getBasicStatForPartitions(
+ table, partList.getNotDeniedPartns(),
StatsSetupConst.TOTAL_SIZE);
+ ds = getSumIgnoreNegatives(dataSizes);
}
- ds = getSumIgnoreNegatives(dataSizes);
+ }
- ds = (long) (ds * deserFactor);
+ // if data size still could not be determined, then fall back to
filesystem to get file
+ // sizes
+ if (ds <= 0) {
+ dataSizes = getFileSizeForPartitions(conf,
partList.getNotDeniedPartns());
}
+ ds = getSumIgnoreNegatives(dataSizes);
+ ds = (long) (ds * deserFactor);
int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
if (avgRowSize > 0) {