HIVE-13821: OrcSplit groups all delta files together into a single split (Prasanth Jayachandran reviewed by Eugene Koifman)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/76961d1f Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/76961d1f Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/76961d1f Branch: refs/heads/java8 Commit: 76961d1f67a5d5e3614d3d81c417684fab92c6c2 Parents: 51609a0 Author: Prasanth Jayachandran <[email protected]> Authored: Wed May 25 18:22:34 2016 -0700 Committer: Prasanth Jayachandran <[email protected]> Committed: Wed May 25 18:22:34 2016 -0700 ---------------------------------------------------------------------- .../ql/exec/tez/ColumnarSplitSizeEstimator.java | 6 +++-- .../hive/ql/io/orc/TestInputOutputFormat.java | 23 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/76961d1f/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java index dfc778a..ecd4ddc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ColumnarSplitSizeEstimator.java @@ -42,7 +42,6 @@ public class ColumnarSplitSizeEstimator implements SplitSizeEstimator { if (isDebugEnabled) { LOG.debug("Estimated column projection size: " + colProjSize); } - return colProjSize; } else if (inputSplit instanceof HiveInputFormat.HiveInputSplit) { InputSplit innerSplit = ((HiveInputFormat.HiveInputSplit) inputSplit).getInputSplit(); @@ -51,9 +50,12 @@ public class ColumnarSplitSizeEstimator implements SplitSizeEstimator { if (isDebugEnabled) { LOG.debug("Estimated column projection size: " + colProjSize); } - return colProjSize; } } + if (colProjSize <= 0) { + /* columnar splits of unknown size - estimate worst-case */ + return Integer.MAX_VALUE; + } return colProjSize; } } http://git-wip-us.apache.org/repos/asf/hive/blob/76961d1f/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 4eb0249..c1ef0e7 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -56,6 +56,7 @@ import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.ExecMapper; +import org.apache.hadoop.hive.ql.exec.tez.ColumnarSplitSizeEstimator; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; @@ -556,6 +557,28 @@ public class TestInputOutputFormat { } @Test + public void testACIDSplitStrategy() throws Exception { + conf.set("bucket_count", "2"); + OrcInputFormat.Context context = new OrcInputFormat.Context(conf); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("mock:/a/delta_000_001/part-00", 1000, new byte[1], new MockBlock("host1")), + new MockFile("mock:/a/delta_000_001/part-01", 1000, new byte[1], new MockBlock("host1")), + new MockFile("mock:/a/delta_001_002/part-02", 1000, new byte[1], new MockBlock("host1")), + new MockFile("mock:/a/delta_001_002/part-03", 1000, new byte[1], new MockBlock("host1"))); + OrcInputFormat.FileGenerator gen = + new OrcInputFormat.FileGenerator(context, fs, + new MockPath(fs, "mock:/a"), false, null); + OrcInputFormat.SplitStrategy splitStrategy = createSplitStrategy(context, gen); + assertEquals(true, splitStrategy instanceof OrcInputFormat.ACIDSplitStrategy); + List<OrcSplit> splits = splitStrategy.getSplits(); + ColumnarSplitSizeEstimator splitSizeEstimator = new ColumnarSplitSizeEstimator(); + for (OrcSplit split: splits) { + assertEquals(Integer.MAX_VALUE, splitSizeEstimator.getEstimatedSize(split)); + } + assertEquals(2, splits.size()); + } + + @Test public void testBIStrategySplitBlockBoundary() throws Exception { conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
