StatsRulesProcFactory.java

rhbutani Wed, 29 Jan 2014 10:59:22 -0800

Author: rhbutani
Date: Wed Jan 29 18:58:19 2014
New Revision: 1562547

URL: http://svn.apache.org/r1562547
Log:
HIVE-6300 Add documentation for stats configs to hive-default.xml.template 
(Prasanth J via Harish Butani, Lefty Leverenz)


Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/conf/hive-default.xml.template
    
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1562547&r1=1562546&r2=1562547&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
(original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Wed 
Jan 29 18:58:19 2014
@@ -652,14 +652,11 @@ public class HiveConf extends Configurat
     HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
     // to accurately compute statistics for GROUPBY map side parallelism needs 
to be known
     HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
-    // statistics annotation fetches column statistics for all required 
columns and for all
-    // required partitions which can be very expensive sometimes
+    // statistics annotation fetches column statistics for all required 
columns which can
+    // be very expensive sometimes
     HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false),
-    // in the absence of table/partition stats, average row size will be used 
to
-    // estimate the number of rows/data size
-    HIVE_STATS_AVG_ROW_SIZE("hive.stats.avg.row.size", 10000),
     // in the absence of column statistics, the estimated number of rows/data 
size that will
-    // emitted from join operator will depend on t factor
+    // be emitted from join operator will depend on this factor
     HIVE_STATS_JOIN_FACTOR("hive.stats.join.factor", (float) 1.1),
     // in the absence of uncompressed/raw data size, total file size will be 
used for statistics
     // annotation. But the file may be compressed, encoded and serialized 
which may be lesser in size

Modified: hive/trunk/conf/hive-default.xml.template
URL: 
http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1562547&r1=1562546&r2=1562547&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Wed Jan 29 18:58:19 2014
@@ -1322,6 +1322,102 @@
 </property>
 
 <property>
+  <name>hive.stats.max.variable.length</name>
+  <value>100</value>
+  <description>
+    To estimate the size of data flowing through operators in Hive/Tez (for 
reducer estimation etc.),
+    average row size is multiplied with the total number of rows coming out of 
each operator.
+    Average row size is computed from average column size of all columns in 
the row. In the absence
+    of column statistics, for variable length columns (like string, bytes 
etc.), this value will be
+    used. For fixed length columns their corresponding Java equivalent sizes 
are used
+    (float - 4 bytes, double - 8 bytes etc.).
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.list.num.entries</name>
+  <value>10</value>
+  <description>
+    To estimate the size of data flowing through operators in Hive/Tez (for 
reducer estimation etc.),
+    average row size is multiplied with the total number of rows coming out of 
each operator.
+    Average row size is computed from average column size of all columns in 
the row. In the absence
+    of column statistics and for variable length complex columns like list, 
the average number of
+    entries/values can be specified using this config.
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.map.num.entries</name>
+  <value>10</value>
+  <description>
+    To estimate the size of data flowing through operators in Hive/Tez (for 
reducer estimation etc.),
+    average row size is multiplied with the total number of rows coming out of 
each operator.
+    Average row size is computed from average column size of all columns in 
the row. In the absence
+    of column statistics and for variable length complex columns like map, the 
average number of
+    entries/values can be specified using this config.
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.map.parallelism</name>
+  <value>1</value>
+  <description>
+    Hive/Tez optimizer estimates the data size flowing through each of the 
operators.
+    For GROUPBY operator, to accurately compute the data size map-side 
parallelism needs to
+    be known. By default, this value is set to 1 since optimizer is not aware 
of the number of
+    mappers during compile-time. This Hive config can be used to specify the 
number of mappers
+    to be used for data size computation of GROUPBY operator.
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.fetch.column.stats</name>
+  <value>false</value>
+  <description>
+    Annotation of operator tree with statistics information requires column 
statistics.
+    Column statistics are fetched from metastore. Fetching column statistics 
for each needed column
+    can be expensive when the number of columns is high. This flag can be used 
to disable fetching
+    of column statistics from metastore.
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.fetch.partition.stats</name>
+  <value>true</value>
+  <description>
+    Annotation of operator tree with statistics information requires partition 
level basic
+    statistics like number of rows, data size and file size. Partition 
statistics are fetched from
+    metastore. Fetching partition statistics for each needed partition can be 
expensive when the
+    number of partitions is high. This flag can be used to disable fetching of 
partition statistics
+    from metastore. When this flag is disabled, Hive will make calls to 
filesystem to get file sizes
+    and will estimate the number of rows from row schema.
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.join.factor</name>
+  <value>1.1</value>
+  <description>
+    Hive/Tez optimizer estimates the data size flowing through each of the 
operators. JOIN operator
+    uses column statistics to estimate the number of rows flowing out of it 
and hence the data size.
+    In the absence of column statistics, this factor determines the number of 
rows that flow out
+    of JOIN operator.
+  </description>
+</property>
+
+<property>
+  <name>hive.stats.deserialization.factor</name>
+  <value>1.0</value>
+  <description>
+    Hive/Tez optimizer estimates the data size flowing through each of the 
operators. In the absence
+    of basic statistics like number of rows and data size, file size is used 
to estimate the number
+    of rows and data size. Since files in tables/partitions are serialized 
(and optionally
+    compressed) the estimates of number of rows and data size cannot be 
reliably determined.
+    This factor is multiplied with the file size to account for serialization 
and compression.
+  </description>
+</property>
+
+<property>
   <name>hive.support.concurrency</name>
   <value>false</value>
   <description>Whether Hive supports concurrency or not. A ZooKeeper instance 
must be up and running for the default Hive lock manager to support read-write 
locks.</description>

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1562547&r1=1562546&r2=1562547&view=diff
==============================================================================
--- 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
 (original)
+++ 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
 Wed Jan 29 18:58:19 2014
@@ -988,9 +988,6 @@ public class StatsRulesProcFactory {
             if (limit <= parentStats.getNumRows()) {
               long numRows = limit;
               long avgRowSize = parentStats.getAvgRowSize();
-              if (avgRowSize <= 0) {
-                avgRowSize = HiveConf.getIntVar(conf, 
HiveConf.ConfVars.HIVE_STATS_AVG_ROW_SIZE);
-              }
               long dataSize = avgRowSize * limit;
               wcStats.setNumRows(numRows);
               wcStats.setDataSize(dataSize);

svn commit: r1562547 - in /hive/trunk: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java conf/hive-default.xml.template ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java

Reply via email to