Author: travis
Date: Tue Sep 25 21:54:22 2012
New Revision: 1390178
URL: http://svn.apache.org/viewvc?rev=1390178&view=rev
Log:
HCATALOG-506 desired number of input splits for large files
Modified:
incubator/hcatalog/trunk/CHANGES.txt
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
Modified: incubator/hcatalog/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/hcatalog/trunk/CHANGES.txt?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/CHANGES.txt (original)
+++ incubator/hcatalog/trunk/CHANGES.txt Tue Sep 25 21:54:22 2012
@@ -40,6 +40,8 @@ Trunk (unreleased changes)
HCAT-427 Document storage-based authorization (lefty via gates)
IMPROVEMENTS
+ HCAT-506 desired number of input splits for large files (gmalewicz via
traviscrawford)
+
HCAT-461 Refactor server-extensions as a subproject (traviscrawford)
HCAT-500 HCatStorer should honor user-specified path for external tables
(pengfeng via traviscrawford)
Modified:
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
URL:
http://svn.apache.org/viewvc/incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
---
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
(original)
+++
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
Tue Sep 25 21:54:22 2012
@@ -79,6 +79,16 @@ public final class HCatConstants {
public static final String HCAT_METASTORE_PRINCIPAL
= HiveConf.ConfVars.METASTORE_KERBEROS_PRINCIPAL.varname;
+ /**
+ * The desired number of input splits produced for each partition. When the
+ * input files are large and few, we want to split them into many splits,
+ * so as to increase the parallelism of loading the splits. Try also two
+ * other parameters, mapred.min.split.size and mapred.max.split.size, to
+ * control the number of input splits.
+ */
+ public static final String HCAT_DESIRED_PARTITION_NUM_SPLITS =
+ "hcat.desired.partition.num.splits";
+
// IMPORTANT IMPORTANT IMPORTANT!!!!!
//The keys used to store info into the job Configuration.
//If any new keys are added, the HCatStorer needs to be updated. The
HCatStorer
Modified:
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
URL:
http://svn.apache.org/viewvc/incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
---
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
(original)
+++
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
Tue Sep 25 21:54:22 2012
@@ -142,11 +142,17 @@ public abstract class HCatBaseInputForma
org.apache.hadoop.mapred.InputFormat inputFormat =
getMapRedInputFormat(jobConf, inputFormatClass);
- //Call getSplit on the InputFormat, create an
- //HCatSplit for each underlying split
- //NumSplits is 0 for our purposes
+ //Call getSplits on the InputFormat, create an HCatSplit for each
+ //underlying split. When the desired number of input splits is missing,
+ //use a default number (denoted by zero).
+ //TODO(malewicz): Currently each partition is split independently into
+ //a desired number. However, we want the union of all partitions to be
+ //split into a desired number while maintaining balanced sizes of input
+ //splits.
+ int desiredNumSplits =
+ conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS,
0);
org.apache.hadoop.mapred.InputSplit[] baseSplits =
- inputFormat.getSplits(jobConf, 0);
+ inputFormat.getSplits(jobConf, desiredNumSplits);
for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
splits.add(new HCatSplit(