Revert "HIVE-11043: ORC split strategies should adapt based on number of files (Gopal V reviewed by Prasanth Jayachandran)"
This reverts commit 5f78f9ef1e6c798849d34cc66721e6c1d9709b6f.


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/eb278d3c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/eb278d3c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/eb278d3c

Branch: refs/heads/beeline-cli
Commit: eb278d3c5a2ce6f11a3917f8646931630c1ee05e
Parents: 74a61e0
Author: Prasanth Jayachandran <[email protected]>
Authored: Tue Jun 23 20:50:39 2015 -0700
Committer: Prasanth Jayachandran <[email protected]>
Committed: Tue Jun 23 20:51:22 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java  | 18 +---
 .../hive/ql/io/orc/TestInputOutputFormat.java  | 97 +-------------------
 2 files changed, 4 insertions(+), 111 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/eb278d3c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 62e6de7..5d6c9da 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -374,7 +374,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private final int numBuckets;
     private final long maxSize;
     private final long minSize;
-    private final int minSplits;
     private final boolean footerInSplits;
     private final boolean cacheStripeDetails;
     private final AtomicInteger cacheHitCounter = new AtomicInteger(0);
@@ -383,10 +382,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private SplitStrategyKind splitStrategyKind;
 
     Context(Configuration conf) {
-      this(conf, 1);
-    }
-
-    Context(Configuration conf, final int minSplits) {
       this.conf = conf;
       minSize = conf.getLong(MIN_SPLIT_SIZE, DEFAULT_MIN_SPLIT_SIZE);
       maxSize = conf.getLong(MAX_SPLIT_SIZE, DEFAULT_MAX_SPLIT_SIZE);
@@ -409,8 +404,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
 
       cacheStripeDetails = (cacheStripeDetailsSize > 0);
 
-      this.minSplits = Math.min(cacheStripeDetailsSize, minSplits);
-
       synchronized (Context.class) {
         if (threadPool == null) {
           threadPool = Executors.newFixedThreadPool(numThreads,
@@ -688,7 +681,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
           break;
         default:
           // HYBRID strategy
-          if (avgFileSize > context.maxSize || numFiles <= context.minSplits) {
+          if (avgFileSize > context.maxSize) {
            splitStrategy = new ETLSplitStrategy(context, fs, dir, children, isOriginal, deltas,
                covered);
          } else {
@@ -990,13 +983,8 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
 
   static List<OrcSplit> generateSplitsInfo(Configuration conf)
       throws IOException {
-    return generateSplitsInfo(conf, -1);
-  }
-
-  static List<OrcSplit> generateSplitsInfo(Configuration conf, int numSplits)
-      throws IOException {
     // use threads to resolve directories into splits
-    Context context = new Context(conf, numSplits);
+    Context context = new Context(conf);
     List<OrcSplit> splits = Lists.newArrayList();
     List<Future<?>> pathFutures = Lists.newArrayList();
     List<Future<?>> splitFutures = Lists.newArrayList();
@@ -1061,7 +1049,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
   public InputSplit[] getSplits(JobConf job,
                                 int numSplits) throws IOException {
     perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.ORC_GET_SPLITS);
-    List<OrcSplit> result = generateSplitsInfo(job, numSplits);
+    List<OrcSplit> result = generateSplitsInfo(job);
     perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.ORC_GET_SPLITS);
     return result.toArray(new InputSplit[result.size()]);
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/eb278d3c/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 12ae902..0246cd5 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -23,11 +23,8 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.DataInput;
 import java.io.DataOutput;
-import java.io.File;
 import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.PrintWriter;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.sql.Date;
@@ -70,7 +67,6 @@ import org.apache.hadoop.hive.ql.io.HiveInputFormat;
 import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
 import org.apache.hadoop.hive.ql.io.InputFormatChecker;
 import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy;
-import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategyKind;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
@@ -398,97 +394,6 @@ public class TestInputOutputFormat {
         OrcInputFormat.getInputPaths(conf));
   }
 
-  private FileSystem generateMockFiles(final int count, final int size) {
-    final byte[] data = new byte[size];
-    MockFile[] files = new MockFile[count];
-    for (int i = 0; i < count; i++) {
-      files[i] = new MockFile(String.format("mock:/a/b/part-%d", i), size, data);
-    }
-    return new MockFileSystem(conf, files);
-  }
-
-  @Test
-  public void testSplitStrategySelection() throws Exception {
-
-    conf.set("mapreduce.input.fileinputformat.split.maxsize", "500");
-    conf.setLong(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE.varname,
-        100);
-    final int[] counts = { 1, 10, 100, 256 };
-    final int[] sizes = { 100, 1000 };
-    final int[] numSplits = { 1, 9, 10, 11, 99, 111 };
-    final String[] strategyResults = new String[] {
-    "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */
-    "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */
-    "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */
-    "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */
-    "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */
-    "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */
-    "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */
-    "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */
-    "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */
-    "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */
-    "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */
-    "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */
-    "BISplitStrategy", /* 10 files x 100 size for 1 splits */
-    "BISplitStrategy", /* 10 files x 100 size for 9 splits */
-    "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */
-    "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */
-    "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */
-    "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */
-    "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */
-    "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */
-    "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */
-    "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */
-    "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */
-    "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */
-    "BISplitStrategy", /* 100 files x 100 size for 1 splits */
-    "BISplitStrategy", /* 100 files x 100 size for 9 splits */
-    "BISplitStrategy", /* 100 files x 100 size for 10 splits */
-    "BISplitStrategy", /* 100 files x 100 size for 11 splits */
-    "BISplitStrategy", /* 100 files x 100 size for 99 splits */
-    "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */
-    "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */
-    "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */
-    "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */
-    "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */
-    "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */
-    "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */
-    "BISplitStrategy", /* 256 files x 100 size for 1 splits */
-    "BISplitStrategy", /* 256 files x 100 size for 9 splits */
-    "BISplitStrategy", /* 256 files x 100 size for 10 splits */
-    "BISplitStrategy", /* 256 files x 100 size for 11 splits */
-    "BISplitStrategy", /* 256 files x 100 size for 99 splits */
-    "BISplitStrategy", /* 256 files x 100 size for 111 splits */
-    "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */
-    "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */
-    "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */
-    "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */
-    "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */
-    "ETLSplitStrategy", /* 256 files x 1000 size for 111 splits */
-    };
-
-    int k = 0;
-
-    for (int c : counts) {
-      for (int s : sizes) {
-        final FileSystem fs = generateMockFiles(c, s);
-        for (int n : numSplits) {
-          final OrcInputFormat.Context context = new OrcInputFormat.Context(
-              conf, n);
-          OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
-              context, fs, new MockPath(fs, "mock:/a/b"));
-          final SplitStrategy splitStrategy = gen.call();
-          assertTrue(
-              String.format(
-                  "Split strategy for %d files x %d size for %d splits", c, s,
-                  n),
-              splitStrategy.getClass().getSimpleName()
-                  .equals(strategyResults[k++]));
-        }
-      }
-    }
-  }
-
   @Test
   public void testFileGenerator() throws Exception {
     OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
@@ -1210,7 +1115,7 @@ public class TestInputOutputFormat {
     InputFormat<?,?> in = new OrcInputFormat();
     FileInputFormat.setInputPaths(conf, testFilePath.toString());
     InputSplit[] splits = in.getSplits(conf, 1);
-    assertTrue(0 == splits.length);
+    assertTrue(1 == splits.length);
     assertEquals(null, serde.getSerDeStats());
   }
 
