Author: gunther
Date: Mon Feb  3 22:11:50 2014
New Revision: 1564102

URL: http://svn.apache.org/r1564102
Log:
HIVE-6298: Add config flag to turn off fetching partition stats (Patch by 
Gunther Hagleitner, reviewed by Sergey Shelukhin and Prasanth J)

Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1564102&r1=1564101&r2=1564102&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
(original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Mon 
Feb  3 22:11:50 2014
@@ -654,6 +654,9 @@ public class HiveConf extends Configurat
     HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
     // to accurately compute statistics for GROUPBY map side parallelism needs to be known
     HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
+    // statistics annotation fetches stats for each partition, which can be expensive. turning
+    // this off will result in basic sizes being fetched from namenode instead
+    HIVE_STATS_FETCH_PARTITION_STATS("hive.stats.fetch.partition.stats", true),
     // statistics annotation fetches column statistics for all required columns which can
     // be very expensive sometimes
     HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false),

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1564102&r1=1564101&r2=1564102&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Mon 
Feb  3 22:11:50 2014
@@ -110,6 +110,8 @@ public class StatsUtils {
     List<String> neededColumns = tableScanOperator.getNeededColumns();
     boolean fetchColStats =
         HiveConf.getBoolVar(conf, 
HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
+    boolean fetchPartStats =
+        HiveConf.getBoolVar(conf, 
HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS);
     float deserFactor =
         HiveConf.getFloatVar(conf, 
HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
 
@@ -151,27 +153,34 @@ public class StatsUtils {
     } else if (partList != null) {
       // For partitioned tables, get the size of all the partitions after pruning
       // the partitions that are not required
-      List<Long> rowCounts = getBasicStatForPartitions(
-          table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
-      List<Long> dataSizes =  getBasicStatForPartitions(
-          table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
+      long nr = 0;
+      long ds = 0;
 
-      long nr = getSumIgnoreNegatives(rowCounts);
-      long ds = getSumIgnoreNegatives(dataSizes);
-      if (ds <= 0) {
-        dataSizes = getBasicStatForPartitions(
-            table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
-        ds = getSumIgnoreNegatives(dataSizes);
+      List<Long> rowCounts = Lists.newArrayList();
+      List<Long> dataSizes = Lists.newArrayList();
+
+      if (fetchPartStats) {
+        rowCounts = getBasicStatForPartitions(
+            table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
+        dataSizes =  getBasicStatForPartitions(
+            table, partList.getNotDeniedPartns(), 
StatsSetupConst.RAW_DATA_SIZE);
 
-        // if data size still could not be determined, then fall back to 
filesytem to get file
-        // sizes
+        nr = getSumIgnoreNegatives(rowCounts);
+        ds = getSumIgnoreNegatives(dataSizes);
         if (ds <= 0) {
-          dataSizes = getFileSizeForPartitions(conf, 
partList.getNotDeniedPartns());
+          dataSizes = getBasicStatForPartitions(
+              table, partList.getNotDeniedPartns(), 
StatsSetupConst.TOTAL_SIZE);
+          ds = getSumIgnoreNegatives(dataSizes);
         }
-        ds = getSumIgnoreNegatives(dataSizes);
+      }
 
-        ds = (long) (ds * deserFactor);
+      // if data size still could not be determined, then fall back to filesystem to get file
+      // sizes
+      if (ds <= 0) {
+        dataSizes = getFileSizeForPartitions(conf, 
partList.getNotDeniedPartns());
       }
+      ds = getSumIgnoreNegatives(dataSizes);
+      ds = (long) (ds * deserFactor);
 
       int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
       if (avgRowSize > 0) {


Reply via email to