http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java deleted file mode 100644 index 4afaaf1..0000000 --- a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java +++ /dev/null @@ -1,188 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.orc.bench; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.TrackingLocalFileSystem; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; -import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Reporter; -import org.apache.orc.OrcFile; -import org.apache.orc.Reader; -import org.apache.orc.RecordReader; -import org.apache.orc.TypeDescription; -import org.apache.parquet.hadoop.ParquetInputFormat; -import org.openjdk.jmh.annotations.AuxCounters; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Level; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.OutputTimeUnit; -import org.openjdk.jmh.annotations.Param; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.runner.Runner; -import org.openjdk.jmh.runner.options.OptionsBuilder; - -import java.net.URI; -import java.util.List; -import java.util.concurrent.TimeUnit; - -@BenchmarkMode(Mode.AverageTime) -@Warmup(iterations=1, time=10, timeUnit = TimeUnit.SECONDS) -@Measurement(iterations=3, time=10, timeUnit = TimeUnit.SECONDS) -@State(Scope.Thread) -@OutputTimeUnit(TimeUnit.MICROSECONDS) -@Fork(1) -public class ColumnProjectionBenchmark { - - private static final String ROOT_ENVIRONMENT_NAME = 
"bench.root.dir"; - private static final Path root; - static { - String value = System.getProperty(ROOT_ENVIRONMENT_NAME); - root = value == null ? null : new Path(value); - } - - @Param({ "github", "sales", "taxi"}) - public String dataset; - - @Param({"none", "snappy", "zlib"}) - public String compression; - - @AuxCounters - @State(Scope.Thread) - public static class ExtraCounters { - long bytesRead; - long reads; - long records; - long invocations; - - @Setup(Level.Iteration) - public void clean() { - bytesRead = 0; - reads = 0; - records = 0; - invocations = 0; - } - - @TearDown(Level.Iteration) - public void print() { - System.out.println(); - System.out.println("Reads: " + reads); - System.out.println("Bytes: " + bytesRead); - System.out.println("Records: " + records); - System.out.println("Invocations: " + invocations); - } - - public long kilobytes() { - return bytesRead / 1024; - } - - public long records() { - return records; - } - } - - @Benchmark - public void orc(ExtraCounters counters) throws Exception{ - Configuration conf = new Configuration(); - TrackingLocalFileSystem fs = new TrackingLocalFileSystem(); - fs.initialize(new URI("file:///"), conf); - FileSystem.Statistics statistics = fs.getLocalStatistics(); - statistics.reset(); - OrcFile.ReaderOptions options = OrcFile.readerOptions(conf).filesystem(fs); - Path path = Utilities.getVariant(root, dataset, "orc", compression); - Reader reader = OrcFile.createReader(path, options); - TypeDescription schema = reader.getSchema(); - boolean[] include = new boolean[schema.getMaximumId() + 1]; - // select first two columns - List<TypeDescription> children = schema.getChildren(); - for(int c= children.get(0).getId(); c <= children.get(1).getMaximumId(); ++c) { - include[c] = true; - } - RecordReader rows = reader.rows(new Reader.Options() - .include(include)); - VectorizedRowBatch batch = schema.createRowBatch(); - while (rows.nextBatch(batch)) { - counters.records += batch.size; - } - rows.close(); - 
counters.bytesRead += statistics.getBytesRead(); - counters.reads += statistics.getReadOps(); - counters.invocations += 1; - } - - @Benchmark - public void parquet(ExtraCounters counters) throws Exception { - JobConf conf = new JobConf(); - conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName()); - conf.set("fs.defaultFS", "track:///"); - if ("taxi".equals(dataset)) { - conf.set("columns", "vendor_id,pickup_time"); - conf.set("columns.types", "int,timestamp"); - } else if ("sales".equals(dataset)) { - conf.set("columns", "sales_id,customer_id"); - conf.set("columns.types", "bigint,bigint"); - } else if ("github".equals(dataset)) { - conf.set("columns", "actor,created_at"); - conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string," + - "id:int,login:string,url:string>,timestamp"); - } else { - throw new IllegalArgumentException("Unknown data set " + dataset); - } - Path path = Utilities.getVariant(root, dataset, "parquet", compression); - FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", - TrackingLocalFileSystem.class); - statistics.reset(); - ParquetInputFormat<ArrayWritable> inputFormat = - new ParquetInputFormat<>(DataWritableReadSupport.class); - - NullWritable nada = NullWritable.get(); - FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{}); - org.apache.hadoop.mapred.RecordReader<NullWritable,ArrayWritable> recordReader = - new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL); - ArrayWritable value = recordReader.createValue(); - while (recordReader.next(nada, value)) { - counters.records += 1; - } - recordReader.close(); - counters.bytesRead += statistics.getBytesRead(); - counters.reads += statistics.getReadOps(); - counters.invocations += 1; - } - public static void main(String[] args) throws Exception { - new Runner(new OptionsBuilder() - .include(ColumnProjectionBenchmark.class.getSimpleName()) - .jvmArgs("-server", "-Xms256m", "-Xmx2g", - "-D" + 
ROOT_ENVIRONMENT_NAME + "=" + args[0]).build() - ).run(); - } -}
http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/CompressionKind.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/CompressionKind.java b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java deleted file mode 100644 index 9274de3..0000000 --- a/java/bench/src/java/org/apache/orc/bench/CompressionKind.java +++ /dev/null @@ -1,87 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.orc.bench; - -import io.airlift.compress.snappy.SnappyCodec; -import org.apache.hadoop.fs.Path; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -/** - * Enum for handling the compression codecs for the benchmark - */ -public enum CompressionKind { - NONE(".none"), - ZLIB(".gz"), - SNAPPY(".snappy"); - - CompressionKind(String extendsion) { - this.extension = extendsion; - } - - private final String extension; - - public String getExtension() { - return extension; - } - - public OutputStream create(OutputStream out) throws IOException { - switch (this) { - case NONE: - return out; - case ZLIB: - return new GZIPOutputStream(out); - case SNAPPY: - return new SnappyCodec().createOutputStream(out); - default: - throw new IllegalArgumentException("Unhandled kind " + this); - } - } - - public InputStream read(InputStream in) throws IOException { - switch (this) { - case NONE: - return in; - case ZLIB: - return new GZIPInputStream(in); - case SNAPPY: - return new SnappyCodec().createInputStream(in); - default: - throw new IllegalArgumentException("Unhandled kind " + this); - } - } - - public static CompressionKind fromPath(Path path) { - String name = path.getName(); - int lastDot = name.lastIndexOf('.'); - if (lastDot >= 0) { - String ext = name.substring(lastDot); - for (CompressionKind value : values()) { - if (ext.equals(value.getExtension())) { - return value; - } - } - } - return NONE; - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/DecimalBench.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/DecimalBench.java b/java/bench/src/java/org/apache/orc/bench/DecimalBench.java deleted file mode 100644 index 71a1c33..0000000 --- a/java/bench/src/java/org/apache/orc/bench/DecimalBench.java 
+++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.orc.bench; - -import com.google.gson.JsonStreamParser; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumReader; -import org.apache.avro.mapred.FsInput; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.TrackingLocalFileSystem; -import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; -import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Reporter; -import org.apache.orc.CompressionKind; 
-import org.apache.orc.OrcFile; -import org.apache.orc.Reader; -import org.apache.orc.RecordReader; -import org.apache.orc.TypeDescription; -import org.apache.orc.Writer; -import org.apache.orc.bench.convert.BatchReader; -import org.apache.orc.bench.convert.GenerateVariants; -import org.apache.orc.bench.convert.csv.CsvReader; -import org.apache.parquet.hadoop.ParquetInputFormat; -import org.openjdk.jmh.annotations.AuxCounters; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Level; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.OutputTimeUnit; -import org.openjdk.jmh.annotations.Param; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; -import org.openjdk.jmh.runner.Runner; -import org.openjdk.jmh.runner.options.OptionsBuilder; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URI; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.TimeUnit; - -@BenchmarkMode(Mode.AverageTime) -@Warmup(iterations=2, time=30, timeUnit = TimeUnit.SECONDS) -@Measurement(iterations=10, time=30, timeUnit = TimeUnit.SECONDS) -@State(Scope.Thread) -@OutputTimeUnit(TimeUnit.MICROSECONDS) -@Fork(2) -public class DecimalBench { - - private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir"; - private static final Path root; - static { - String value = System.getProperty(ROOT_ENVIRONMENT_NAME); - root = value == null ? 
null : new Path(value); - } - - /** - * Abstract out whether we are writing short or long decimals - */ - interface Loader { - /** - * Load the data from the values array into the ColumnVector. - * @param vector the output - * @param values the intput - * @param offset the first input value - * @param length the number of values to copy - */ - void loadData(ColumnVector vector, long[] values, int offset, int length); - } - - static class Decimal64Loader implements Loader { - final int scale; - final int precision; - - Decimal64Loader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void loadData(ColumnVector vector, long[] values, int offset, int length) { - Decimal64ColumnVector v = (Decimal64ColumnVector) vector; - v.ensureSize(length, false); - v.noNulls = true; - for(int p=0; p < length; ++p) { - v.vector[p] = values[p + offset]; - } - v.precision = (short) precision; - v.scale = (short) scale; - } - } - - static class DecimalLoader implements Loader { - final int scale; - final int precision; - - DecimalLoader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public void loadData(ColumnVector vector, long[] values, int offset, int length) { - DecimalColumnVector v = (DecimalColumnVector) vector; - v.noNulls = true; - for(int p=0; p < length; ++p) { - v.vector[p].setFromLongAndScale(values[offset + p], scale); - } - v.precision = (short) precision; - v.scale = (short) scale; - } - } - - @State(Scope.Thread) - public static class OutputState { - - // try both short and long decimals - @Param({"8", "19"}) - public int precision; - - long[] total_amount = new long[1024 * 1024]; - Configuration conf = new Configuration(); - FileSystem fs = new NullFileSystem(); - TypeDescription schema; - VectorizedRowBatch batch; - Loader loader; - - @Setup - public void setup() throws IOException { - schema = TypeDescription.createDecimal() - .withScale(2) - 
.withPrecision(precision); - loader = precision <= 18 ? - new Decimal64Loader(precision, 2) : - new DecimalLoader(precision, 2); - readCsvData(total_amount, root, "total_amount", conf); - batch = schema.createRowBatchV2(); - } - } - - @Benchmark - public void write(OutputState state) throws Exception { - Writer writer = OrcFile.createWriter(new Path("null"), - OrcFile.writerOptions(state.conf) - .fileSystem(state.fs) - .setSchema(state.schema) - .compress(CompressionKind.NONE)); - int r = 0; - int batchSize = state.batch.getMaxSize(); - while (r < state.total_amount.length) { - state.batch.size = batchSize; - state.loader.loadData(state.batch.cols[0], state.total_amount, r, batchSize); - writer.addRowBatch(state.batch); - r += batchSize; - } - writer.close(); - } - - static void readCsvData(long[] data, - Path root, - String column, - Configuration conf) throws IOException { - TypeDescription schema = Utilities.loadSchema("taxi.schema"); - int row = 0; - int batchPosn = 0; - BatchReader reader = - new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"), "csv", - schema, conf, org.apache.orc.bench.CompressionKind.ZLIB); - VectorizedRowBatch batch = schema.createRowBatch(); - batch.size = 0; - TypeDescription columnSchema = schema.findSubtype(column); - DecimalColumnVector cv = (DecimalColumnVector) batch.cols[columnSchema.getId() - 1]; - int scale = columnSchema.getScale(); - while (row < data.length) { - if (batchPosn >= batch.size) { - if (!reader.nextBatch(batch)) { - throw new IllegalArgumentException("Not enough data"); - } - batchPosn = 0; - } - data[row++] = cv.vector[batchPosn++].serialize64(scale); - } - } - - @State(Scope.Thread) - public static class InputState { - - // try both DecimalColumnVector and Decimal64ColumnVector - @Param({"ORIGINAL", "USE_DECIMAL64"}) - public TypeDescription.RowBatchVersion version; - - Configuration conf = new Configuration(); - FileSystem fs; - TypeDescription schema; - VectorizedRowBatch batch; - Path path; - 
boolean[] include; - Reader reader; - OrcFile.ReaderOptions options; - - @Setup - public void setup() throws IOException { - fs = FileSystem.getLocal(conf).getRaw(); - path = new Path(root, "generated/taxi/orc.none"); - schema = Utilities.loadSchema("taxi.schema"); - batch = schema.createRowBatch(version, 1024); - // only include the columns with decimal values - include = new boolean[schema.getMaximumId() + 1]; - for(TypeDescription child: schema.getChildren()) { - if (child.getCategory() == TypeDescription.Category.DECIMAL) { - include[child.getId()] = true; - } - } - reader = OrcFile.createReader(path, - OrcFile.readerOptions(conf).filesystem(fs)); - // just read the decimal columns from the first stripe - reader.options().include(include).range(0, 1000); - } - } - - @Benchmark - public void read(Blackhole blackhole, InputState state) throws Exception { - RecordReader rows = state.reader.rows(); - while (rows.nextBatch(state.batch)) { - blackhole.consume(state.batch); - } - rows.close(); - } - - public static void main(String[] args) throws Exception { - new Runner(new OptionsBuilder() - .include(DecimalBench.class.getSimpleName()) - .jvmArgs("-server", "-Xms256m", "-Xmx2g", - "-D" + ROOT_ENVIRONMENT_NAME + "=" + args[0]).build() - ).run(); - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/Driver.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/Driver.java b/java/bench/src/java/org/apache/orc/bench/Driver.java deleted file mode 100644 index 6a86f90..0000000 --- a/java/bench/src/java/org/apache/orc/bench/Driver.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.orc.bench; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.DefaultParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.orc.bench.convert.GenerateVariants; -import org.apache.orc.bench.convert.ScanVariants; - -import java.util.Arrays; - -/** - * A driver tool to call the various benchmark classes. 
- */ -public class Driver { - - static CommandLine parseCommandLine(String[] args) throws ParseException { - Options options = new Options() - .addOption("h", "help", false, "Provide help") - .addOption("D", "define", true, "Change configuration settings"); - CommandLine result = new DefaultParser().parse(options, args, true); - if (result.hasOption("help") || result.getArgs().length == 0) { - new HelpFormatter().printHelp("benchmark <command>", options); - System.err.println(); - System.err.println("Commands:"); - System.err.println(" generate - Generate data variants"); - System.err.println(" scan - Scan data variants"); - System.err.println(" read-all - Full table scan benchmark"); - System.err.println(" read-some - Column projection benchmark"); - System.err.println(" decimal - Decimal benchmark"); - System.exit(1); - } - return result; - } - - public static void main(String[] args) throws Exception { - CommandLine cli = parseCommandLine(args); - args = cli.getArgs(); - String command = args[0]; - args = Arrays.copyOfRange(args, 1, args.length); - switch (command) { - case "generate": - GenerateVariants.main(args); - break; - case "scan": - ScanVariants.main(args); - break; - case "read-all": - FullReadBenchmark.main(args); - break; - case "read-some": - ColumnProjectionBenchmark.main(args); - break; - case "decimal": - DecimalBench.main(args); - break; - default: - System.err.println("Unknown command " + command); - System.exit(1); - } - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java b/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java deleted file mode 100644 index 952f18d..0000000 --- a/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java +++ /dev/null @@ -1,223 +0,0 @@ -/** - * Licensed to the Apache Software 
Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.orc.bench; - -import com.google.gson.JsonStreamParser; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumReader; -import org.apache.avro.mapred.FsInput; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.TrackingLocalFileSystem; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; -import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Reporter; -import org.apache.orc.OrcFile; -import org.apache.orc.Reader; -import org.apache.orc.RecordReader; -import org.apache.orc.TypeDescription; -import org.apache.parquet.hadoop.ParquetInputFormat; -import org.openjdk.jmh.annotations.AuxCounters; -import org.openjdk.jmh.annotations.Benchmark; -import 
org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Level; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.OutputTimeUnit; -import org.openjdk.jmh.annotations.Param; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.runner.Runner; -import org.openjdk.jmh.runner.options.OptionsBuilder; - -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URI; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.TimeUnit; - -@BenchmarkMode(Mode.AverageTime) -@Warmup(iterations=1, time=10, timeUnit = TimeUnit.SECONDS) -@Measurement(iterations=3, time=10, timeUnit = TimeUnit.SECONDS) -@State(Scope.Thread) -@OutputTimeUnit(TimeUnit.MICROSECONDS) -@Fork(1) -public class FullReadBenchmark { - - private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir"; - private static final Path root; - static { - String value = System.getProperty(ROOT_ENVIRONMENT_NAME); - root = value == null ? 
null : new Path(value); - } - - @Param({"taxi", "sales", "github"}) - public String dataset; - - @Param({"none", "zlib", "snappy"}) - public String compression; - - @AuxCounters - @State(Scope.Thread) - public static class ExtraCounters { - long bytesRead; - long reads; - long records; - long invocations; - - @Setup(Level.Iteration) - public void clean() { - bytesRead = 0; - reads = 0; - records = 0; - invocations = 0; - } - - @TearDown(Level.Iteration) - public void print() { - System.out.println(); - System.out.println("Reads: " + reads); - System.out.println("Bytes: " + bytesRead); - System.out.println("Records: " + records); - System.out.println("Invocations: " + invocations); - } - - public long kilobytes() { - return bytesRead / 1024; - } - - public long records() { - return records; - } - } - - @Benchmark - public void orc(ExtraCounters counters) throws Exception{ - Configuration conf = new Configuration(); - TrackingLocalFileSystem fs = new TrackingLocalFileSystem(); - fs.initialize(new URI("file:///"), conf); - FileSystem.Statistics statistics = fs.getLocalStatistics(); - statistics.reset(); - OrcFile.ReaderOptions options = OrcFile.readerOptions(conf).filesystem(fs); - Path path = Utilities.getVariant(root, dataset, "orc", compression); - Reader reader = OrcFile.createReader(path, options); - TypeDescription schema = reader.getSchema(); - RecordReader rows = reader.rows(); - VectorizedRowBatch batch = schema.createRowBatch(); - while (rows.nextBatch(batch)) { - counters.records += batch.size; - } - rows.close(); - counters.bytesRead += statistics.getBytesRead(); - counters.reads += statistics.getReadOps(); - counters.invocations += 1; - } - - @Benchmark - public void avro(ExtraCounters counters) throws Exception { - Configuration conf = new Configuration(); - conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName()); - conf.set("fs.defaultFS", "track:///"); - Path path = Utilities.getVariant(root, dataset, "avro", compression); - 
FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", - TrackingLocalFileSystem.class); - statistics.reset(); - FsInput file = new FsInput(path, conf); - DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - DataFileReader<GenericRecord> dataFileReader = - new DataFileReader<>(file, datumReader); - GenericRecord record = null; - while (dataFileReader.hasNext()) { - record = dataFileReader.next(record); - counters.records += 1; - } - counters.bytesRead += statistics.getBytesRead(); - counters.reads += statistics.getReadOps(); - counters.invocations += 1; - } - - @Benchmark - public void parquet(ExtraCounters counters) throws Exception { - JobConf conf = new JobConf(); - conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName()); - conf.set("fs.defaultFS", "track:///"); - Path path = Utilities.getVariant(root, dataset, "parquet", compression); - FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", - TrackingLocalFileSystem.class); - statistics.reset(); - ParquetInputFormat<ArrayWritable> inputFormat = - new ParquetInputFormat<>(DataWritableReadSupport.class); - - NullWritable nada = NullWritable.get(); - FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{}); - org.apache.hadoop.mapred.RecordReader<NullWritable,ArrayWritable> recordReader = - new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL); - ArrayWritable value = recordReader.createValue(); - while (recordReader.next(nada, value)) { - counters.records += 1; - } - recordReader.close(); - counters.bytesRead += statistics.getBytesRead(); - counters.reads += statistics.getReadOps(); - counters.invocations += 1; - } - - @Benchmark - public void json(ExtraCounters counters) throws Exception { - Configuration conf = new Configuration(); - TrackingLocalFileSystem fs = new TrackingLocalFileSystem(); - fs.initialize(new URI("file:///"), conf); - FileSystem.Statistics statistics = fs.getLocalStatistics(); - 
statistics.reset(); - Path path = Utilities.getVariant(root, dataset, "json", compression); - CompressionKind compress = CompressionKind.valueOf(compression); - InputStream input = compress.read(fs.open(path)); - JsonStreamParser parser = - new JsonStreamParser(new InputStreamReader(input, - StandardCharsets.UTF_8)); - while (parser.hasNext()) { - parser.next(); - counters.records += 1; - } - counters.bytesRead += statistics.getBytesRead(); - counters.reads += statistics.getReadOps(); - counters.invocations += 1; - } - - public static void main(String[] args) throws Exception { - new Runner(new OptionsBuilder() - .include(FullReadBenchmark.class.getSimpleName()) - .addProfiler("hs_gc") - .jvmArgs("-server", "-Xms256m", "-Xmx2g", - "-D" + ROOT_ENVIRONMENT_NAME + "=" + args[0]).build() - ).run(); - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java b/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java deleted file mode 100644 index 23d19cc..0000000 --- a/java/bench/src/java/org/apache/orc/bench/NullFileSystem.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.bench;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Progressable;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * A Hadoop {@link FileSystem} that discards every byte written to it and
 * supplies an immediately-empty stream for every read. The benchmarks use it
 * to measure pure encode/decode cost without any real disk or network I/O.
 *
 * NOTE(review): {@link #listStatus}, {@link #getFileStatus} and
 * {@link #getWorkingDirectory} return {@code null} instead of throwing
 * {@link FileNotFoundException} as the FileSystem contract suggests; the
 * benchmark callers appear to tolerate this — confirm before reusing this
 * class outside the bench module.
 */
public class NullFileSystem extends FileSystem {
  /** URI for this scheme; the path component is never interpreted. */
  @Override
  public URI getUri() {
    try {
      return new URI("null:///");
    } catch (URISyntaxException e) {
      // "null:///" is a constant valid URI, so this is unreachable in
      // practice; rethrow as unchecked to satisfy the compiler.
      throw new IllegalArgumentException("Bad URL", e);
    }
  }

  /**
   * Opens a stream that is already at end-of-file.
   * @param path ignored
   * @param bufferSize ignored
   */
  @Override
  public FSDataInputStream open(Path path, int bufferSize) throws IOException {
    return new FSDataInputStream(new InputStream() {
      @Override
      public int read() throws IOException {
        return -1;  // always EOF
      }
    });
  }

  /** An OutputStream that silently drops everything written to it. */
  static class NullOutput extends OutputStream {

    @Override
    public void write(int b) {
      // pass — deliberately discard the byte
    }

    @Override
    public void write(byte[] buffer, int offset, int length) {
      // pass — bulk writes are discarded too, avoiding the per-byte loop
      // the default implementation would run
    }
  }

  /** Shared sink; NullOutput is stateless, so one instance suffices. */
  private static final OutputStream NULL_OUTPUT = new NullOutput();

  /**
   * "Creates" a file by returning a stream that discards all output.
   * All parameters are ignored.
   */
  @Override
  public FSDataOutputStream create(Path path,
                                   FsPermission permission,
                                   boolean overwrite,
                                   int bufferSize,
                                   short replication,
                                   long blockSize,
                                   Progressable progressable) throws IOException {
    return new FSDataOutputStream(NULL_OUTPUT);
  }

  /** Appending behaves exactly like creating: output is discarded. */
  @Override
  public FSDataOutputStream append(Path path,
                                   int bufferSize,
                                   Progressable progressable) throws IOException {
    return new FSDataOutputStream(NULL_OUTPUT);
  }

  /** Renames never succeed — there is nothing to rename. */
  @Override
  public boolean rename(Path path, Path dest) {
    return false;
  }

  /** Deletes never succeed — there is nothing to delete. */
  @Override
  public boolean delete(Path path, boolean recursive) {
    return false;
  }

  /** @return null — this filesystem has no directory structure. */
  @Override
  public FileStatus[] listStatus(Path path) {
    return null;
  }

  @Override
  public void setWorkingDirectory(Path path) {
    // pass — working directory is meaningless here
  }

  /** @return null — no working directory is tracked. */
  @Override
  public Path getWorkingDirectory() {
    return null;
  }

  /** Directory creation is a no-op that reports failure. */
  @Override
  public boolean mkdirs(Path path, FsPermission permission) throws IOException {
    return false;
  }

  /** @return null — no file ever exists in this filesystem. */
  @Override
  public FileStatus getFileStatus(Path path) throws IOException {
    return null;
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.bench;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * A deterministic, seedable generator of synthetic columnar data for
 * benchmarks. Callers declare a schema one field at a time via
 * {@link #addField} and attach a value generator (auto-increment, ranges,
 * string patterns, ...) to each field, then call
 * {@link #generate(VectorizedRowBatch, int)} repeatedly to fill batches.
 * The same seed always produces the same data.
 */
public class RandomGenerator {
  // Schema built up incrementally by addField, in declaration order.
  private final TypeDescription schema = TypeDescription.createStruct();
  // One Field per top-level column, parallel to schema's children.
  private final List<Field> fields = new ArrayList<>();
  private final Random random;

  public RandomGenerator(int seed) {
    random = new Random(seed);
  }

  /**
   * Base class for all per-column value generators. Each generator fills
   * the first valueCount slots of a vector, marking a slot null with
   * probability {@code nullProbability}.
   */
  private abstract class ValueGenerator {
    double nullProbability = 0;
    abstract void generate(ColumnVector vector, int valueCount);
  }

  /** Generates uniformly random booleans (0 or 1) into a LongColumnVector. */
  private class RandomBoolean extends ValueGenerator {
    public void generate(ColumnVector v, int valueCount) {
      LongColumnVector vector = (LongColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          vector.vector[r] = random.nextInt(2);
        }
      }
    }
  }

  /** Generates lists with lengths uniform in [minSize, maxSize]. */
  private class RandomList extends ValueGenerator {
    private final int minSize;
    private final int sizeRange;
    private final Field child;

    public RandomList(int minSize, int maxSize, Field child) {
      this.minSize = minSize;
      this.sizeRange = maxSize - minSize + 1;
      this.child = child;
    }

    public void generate(ColumnVector v, int valueCount) {
      ListColumnVector vector = (ListColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          // Lists are packed end-to-end in the child vector; childCount is
          // the running offset of the next free child slot.
          vector.offsets[r] = vector.childCount;
          vector.lengths[r] = random.nextInt(sizeRange) + minSize;
          vector.childCount += vector.lengths[r];
        }
      }
      vector.child.ensureSize(vector.childCount, false);
      child.generator.generate(vector.child, vector.childCount);
    }
  }

  /** Generates structs by delegating to each child's generator. */
  private class RandomStruct extends ValueGenerator {
    private final Field[] children;

    public RandomStruct(Field[] children) {
      this.children = children;
    }

    public void generate(ColumnVector v, int valueCount) {
      StructColumnVector vector = (StructColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        }
      }
      // Children are filled for every row, including null struct rows;
      // readers ignore child values under a null parent.
      for(int c=0; c < children.length; ++c) {
        children[c].generator.generate(vector.fields[c], valueCount);
      }
    }
  }

  /**
   * Base for integer generators. Computes a sign bit and mask for the
   * declared integer width so generated values can be sign-extended into
   * the canonical 64-bit representation the vector expects.
   */
  private abstract class IntegerGenerator extends ValueGenerator {
    private final long sign;
    private final long mask;

    private IntegerGenerator(TypeDescription.Category kind) {
      int bits = getIntegerLength(kind);
      // For 64-bit values no masking is needed (shift by 64 would be UB-ish
      // in Java: it is a no-op, so special-case it to 0).
      mask = bits == 64 ? 0 : -1L << bits;
      sign = 1L << (bits - 1);
    }

    /** Sign-extend each value so it stays in range for the declared width. */
    protected void normalize(LongColumnVector vector, int valueCount) {
      for(int r=0; r < valueCount; ++r) {
        if ((vector.vector[r] & sign) == 0) {
          vector.vector[r] &= ~mask;
        } else {
          vector.vector[r] |= mask;
        }
      }
    }
  }

  /** Generates start, start+increment, start+2*increment, ... */
  private class AutoIncrement extends IntegerGenerator {
    private long value;
    private final long increment;

    private AutoIncrement(TypeDescription.Category kind, long start,
                          long increment) {
      super(kind);
      this.value = start;
      this.increment = increment;
    }

    public void generate(ColumnVector v, int valueCount) {
      LongColumnVector vector = (LongColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        // BUG FIX: this test used ">= nullProbability", the inverse of every
        // other generator in this class, so a field with hasNulls(0.05)
        // produced ~95% nulls and almost no incrementing values.
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          vector.vector[r] = value;
          value += increment;
        }
      }
      normalize(vector, valueCount);
    }
  }

  /** Generates uniformly random integers of the declared width. */
  private class RandomInteger extends IntegerGenerator {

    private RandomInteger(TypeDescription.Category kind) {
      super(kind);
    }

    public void generate(ColumnVector v, int valueCount) {
      LongColumnVector vector = (LongColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          vector.vector[r] = random.nextLong();
        }
      }
      normalize(vector, valueCount);
    }
  }

  /**
   * Generates integers uniform in [minimum, maximum] using rejection
   * sampling so the modulo does not bias toward small values.
   */
  private class IntegerRange extends IntegerGenerator {
    private final long minimum;
    private final long range;
    private final long limit;

    private IntegerRange(TypeDescription.Category kind, long minimum,
                         long maximum) {
      super(kind);
      this.minimum = minimum;
      this.range = maximum - minimum + 1;
      if (this.range < 0) {
        throw new IllegalArgumentException("Can't support a negative range "
            + range);
      }
      // Largest multiple of range that fits in a non-negative long; samples
      // at or above it are rejected to keep the distribution uniform.
      limit = (Long.MAX_VALUE / range) * range;
    }

    public void generate(ColumnVector v, int valueCount) {
      LongColumnVector vector = (LongColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          long rand;
          do {
            // clear the sign bit
            rand = random.nextLong() & Long.MAX_VALUE;
          } while (rand >= limit);
          vector.vector[r] = (rand % range) + minimum;
        }
      }
      normalize(vector, valueCount);
    }
  }

  /** Picks uniformly from a fixed set of strings (stored as UTF-8 bytes). */
  private class StringChooser extends ValueGenerator {
    private final byte[][] choices;
    private StringChooser(String[] values) {
      choices = new byte[values.length][];
      for(int e=0; e < values.length; ++e) {
        choices[e] = values[e].getBytes(StandardCharsets.UTF_8);
      }
    }

    public void generate(ColumnVector v, int valueCount) {
      BytesColumnVector vector = (BytesColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          int val = random.nextInt(choices.length);
          // setRef shares the constant byte array instead of copying it.
          vector.setRef(r, choices[val], 0, choices[val].length);
        }
      }
    }
  }

  private static byte[] concat(byte[] left, byte[] right) {
    byte[] result = new byte[left.length + right.length];
    System.arraycopy(left, 0, result, 0, left.length);
    System.arraycopy(right, 0, result, left.length, right.length);
    return result;
  }

  private static byte pickOne(byte[] choices, Random random) {
    return choices[random.nextInt(choices.length)];
  }

  // Character classes used by StringPattern template characters.
  private static final byte[] LOWER_CONSONANTS =
      "bcdfghjklmnpqrstvwxyz".getBytes(StandardCharsets.UTF_8);
  private static final byte[] UPPER_CONSONANTS =
      "BCDFGHJKLMNPQRSTVWXYZ".getBytes(StandardCharsets.UTF_8);
  private static final byte[] CONSONANTS =
      concat(LOWER_CONSONANTS, UPPER_CONSONANTS);
  private static final byte[] LOWER_VOWELS = "aeiou".getBytes(StandardCharsets.UTF_8);
  private static final byte[] UPPER_VOWELS = "AEIOU".getBytes(StandardCharsets.UTF_8);
  private static final byte[] VOWELS = concat(LOWER_VOWELS, UPPER_VOWELS);
  private static final byte[] LOWER_LETTERS =
      concat(LOWER_CONSONANTS, LOWER_VOWELS);
  private static final byte[] UPPER_LETTERS =
      concat(UPPER_CONSONANTS, UPPER_VOWELS);
  private static final byte[] LETTERS = concat(LOWER_LETTERS, UPPER_LETTERS);
  private static final byte[] NATURAL_DIGITS = "123456789".getBytes(StandardCharsets.UTF_8);
  private static final byte[] DIGITS = "0123456789".getBytes(StandardCharsets.UTF_8);

  /**
   * Generates strings from a template. Template characters are replaced
   * per-row: C/c = upper/lower consonant, E = any consonant, V/v =
   * upper/lower vowel, F = any vowel, L/l = upper/lower letter, D = any
   * letter, x = digit 1-9, X = digit 0-9. All other characters are literal.
   */
  private class StringPattern extends ValueGenerator {
    private final byte[] buffer;
    private final byte[][] choices;
    private final int[] locations;

    private StringPattern(String pattern) {
      buffer = pattern.getBytes(StandardCharsets.UTF_8);
      // First pass: count template characters to size the arrays.
      int locs = 0;
      for(int i=0; i < buffer.length; ++i) {
        switch (buffer[i]) {
          case 'C':
          case 'c':
          case 'E':
          case 'V':
          case 'v':
          case 'F':
          case 'l':
          case 'L':
          case 'D':
          case 'x':
          case 'X':
            locs += 1;
            break;
          default:
            break;
        }
      }
      locations = new int[locs];
      choices = new byte[locs][];
      // Second pass: record each template position and its character class.
      locs = 0;
      for(int i=0; i < buffer.length; ++i) {
        switch (buffer[i]) {
          case 'C':
            locations[locs] = i;
            choices[locs++] = UPPER_CONSONANTS;
            break;
          case 'c':
            locations[locs] = i;
            choices[locs++] = LOWER_CONSONANTS;
            break;
          case 'E':
            locations[locs] = i;
            choices[locs++] = CONSONANTS;
            break;
          case 'V':
            locations[locs] = i;
            choices[locs++] = UPPER_VOWELS;
            break;
          case 'v':
            locations[locs] = i;
            choices[locs++] = LOWER_VOWELS;
            break;
          case 'F':
            locations[locs] = i;
            choices[locs++] = VOWELS;
            break;
          case 'l':
            locations[locs] = i;
            choices[locs++] = LOWER_LETTERS;
            break;
          case 'L':
            locations[locs] = i;
            choices[locs++] = UPPER_LETTERS;
            break;
          case 'D':
            locations[locs] = i;
            choices[locs++] = LETTERS;
            break;
          case 'x':
            locations[locs] = i;
            choices[locs++] = NATURAL_DIGITS;
            break;
          case 'X':
            locations[locs] = i;
            choices[locs++] = DIGITS;
            break;
          default:
            break;
        }
      }
    }

    public void generate(ColumnVector v, int valueCount) {
      BytesColumnVector vector = (BytesColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          // Rewrite only the template positions; setVal copies the buffer,
          // so reusing one scratch buffer across rows is safe.
          for(int m=0; m < locations.length; ++m) {
            buffer[locations[m]] = pickOne(choices[m], random);
          }
          vector.setVal(r, buffer, 0, buffer.length);
        }
      }
    }
  }

  /**
   * Generates timestamps uniform between two "yyyy-mm-dd hh:mm:ss" bounds
   * (inclusive), with random sub-millisecond nanos.
   */
  private class TimestampRange extends ValueGenerator {
    private final long minimum;
    private final long range;
    private final long limit;

    private TimestampRange(String min, String max) {
      minimum = Timestamp.valueOf(min).getTime();
      range = Timestamp.valueOf(max).getTime() - minimum + 1;
      if (range < 0) {
        throw new IllegalArgumentException("Negative range " + range);
      }
      limit = (Long.MAX_VALUE / range) * range;
    }

    public void generate(ColumnVector v, int valueCount) {
      TimestampColumnVector vector = (TimestampColumnVector) v;
      for(int r=0; r < valueCount; ++r) {
        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
          v.noNulls = false;
          v.isNull[r] = true;
        } else {
          long rand;
          do {
            // clear the sign bit
            rand = random.nextLong() & Long.MAX_VALUE;
          } while (rand >= limit);
          vector.time[r] = (rand % range) + minimum;
          // NOTE(review): bound is 1,000,000, so nanos only cover the
          // sub-millisecond range below 1ms — presumably intentional since
          // time[] already carries milliseconds; confirm if full-nanosecond
          // spread is wanted.
          vector.nanos[r] = random.nextInt(1000000);
        }
      }
    }
  }

  /** Maps an integer category to its bit width. */
  private static int getIntegerLength(TypeDescription.Category kind) {
    switch (kind) {
      case BYTE:
        return 8;
      case SHORT:
        return 16;
      case INT:
        return 32;
      case LONG:
        return 64;
      default:
        throw new IllegalArgumentException("Unhandled type " + kind);
    }
  }

  /**
   * A column (or nested sub-column) of the generated schema. The add*
   * methods attach a value generator and return this for chaining;
   * {@link #hasNulls} must be called after a generator is attached.
   */
  public class Field {
    private final TypeDescription type;
    private Field[] children;
    private ValueGenerator generator;

    private Field(TypeDescription type) {
      this.type = type;
      if (!type.getCategory().isPrimitive()) {
        List<TypeDescription> childrenTypes = type.getChildren();
        children = new Field[childrenTypes.size()];
        for(int c=0; c < children.length; ++c) {
          children[c] = new Field(childrenTypes.get(c));
        }
      }
    }

    public Field addAutoIncrement(long start, long increment) {
      generator = new AutoIncrement(type.getCategory(), start, increment);
      return this;
    }

    public Field addIntegerRange(long min, long max) {
      generator = new IntegerRange(type.getCategory(), min, max);
      return this;
    }

    public Field addRandomInt() {
      generator = new RandomInteger(type.getCategory());
      return this;
    }

    public Field addStringChoice(String... choices) {
      if (type.getCategory() != TypeDescription.Category.STRING) {
        throw new IllegalArgumentException("Must be string - " + type);
      }
      generator = new StringChooser(choices);
      return this;
    }

    public Field addStringPattern(String pattern) {
      if (type.getCategory() != TypeDescription.Category.STRING) {
        throw new IllegalArgumentException("Must be string - " + type);
      }
      generator = new StringPattern(pattern);
      return this;
    }

    public Field addTimestampRange(String start, String end) {
      if (type.getCategory() != TypeDescription.Category.TIMESTAMP) {
        throw new IllegalArgumentException("Must be timestamp - " + type);
      }
      generator = new TimestampRange(start, end);
      return this;
    }

    public Field addBoolean() {
      if (type.getCategory() != TypeDescription.Category.BOOLEAN) {
        throw new IllegalArgumentException("Must be boolean - " + type);
      }
      generator = new RandomBoolean();
      return this;
    }

    /** Sets the fraction of rows that will be null; call after add*. */
    public Field hasNulls(double probability) {
      generator.nullProbability = probability;
      return this;
    }

    public Field addStruct() {
      generator = new RandomStruct(children);
      return this;
    }

    public Field addList(int minSize, int maxSize) {
      generator = new RandomList(minSize, maxSize, children[0]);
      return this;
    }

    public Field getChildField(int child) {
      return children[child];
    }
  }

  /** Adds a primitive column of the given category. */
  public Field addField(String name, TypeDescription.Category kind) {
    TypeDescription type = new TypeDescription(kind);
    return addField(name, type);
  }

  /** Adds a column of an arbitrary (possibly nested) type. */
  public Field addField(String name, TypeDescription type) {
    schema.addField(name, type);
    Field result = new Field(type);
    fields.add(result);
    return result;
  }

  /** Fills the batch with rowCount generated rows. */
  public void generate(VectorizedRowBatch batch, int rowCount) {
    batch.reset();
    for(int c=0; c < batch.cols.length; ++c) {
      fields.get(c).generator.generate(batch.cols[c], rowCount);
    }
    batch.size = rowCount;
  }

  /**
   * Get the schema for the table that is being generated.
   * @return the struct schema containing all added fields
   */
  public TypeDescription getSchema() {
    return schema;
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.bench;

import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;
import org.apache.orc.bench.convert.BatchReader;

/**
 * A BatchReader that synthesizes a fixed number of rows of a sales-like
 * schema using {@link RandomGenerator}. The column names, ranges, string
 * patterns and null fractions below are hard-coded to mimic a particular
 * production dataset; presumably the odd null probabilities (e.g.
 * 0.9993100389335173) were measured from that dataset — do not "round" them.
 * The same (rows, seed) pair always yields the same data.
 */
public class SalesGenerator implements BatchReader {
  private final RandomGenerator generator;
  // Rows still to be produced; decremented by each nextBatch call.
  private long rowsRemaining;
  // Null fraction used for columns that are almost always null.
  private final static double MOSTLY = 0.99999;

  /** Builds a generator with the default seed (42). */
  public SalesGenerator(long rows) {
    this(rows, 42);
  }

  /**
   * Builds a generator producing exactly {@code rows} rows.
   * @param rows total number of rows nextBatch will emit before returning false
   * @param seed seed for the underlying RandomGenerator (deterministic output)
   */
  public SalesGenerator(long rows, int seed) {
    generator = new RandomGenerator(seed);
    // NOTE(review): the "column N" comments below come from the original
    // author and skip numbers; they appear to track positions in the source
    // dataset rather than positions in this schema.
    // column 1
    generator.addField("sales_id", TypeDescription.Category.LONG)
        .addAutoIncrement(1000000000, 1);
    generator.addField("customer_id", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 2000000000);
    generator.addField("col3", TypeDescription.Category.LONG)
        .addIntegerRange(1, 10000).hasNulls(0.9993100389335173);

    // column 4
    generator.addField("item_category", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(0.00014784879996054823);
    generator.addField("item_count", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000);
    generator.addField("change_ts", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");

    // column 7
    generator.addField("store_location", TypeDescription.Category.STRING)
        .addStringChoice("Los Angeles", "New York", "Cupertino", "Sunnyvale",
            "Boston", "Chicago", "Seattle", "Jackson",
            "Palo Alto", "San Mateo", "San Jose", "Santa Clara",
            "Irvine", "Torrance", "Gardena", "Hermosa", "Manhattan")
        .hasNulls(0.0004928293332019384);
    generator.addField("associate_id", TypeDescription.Category.STRING)
        .addStringPattern("MR V").hasNulls(0.05026859198659506);
    generator.addField("col9", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000).hasNulls(MOSTLY);

    // column 10
    generator.addField("rebate_id", TypeDescription.Category.STRING)
        .addStringPattern("xxxxxx").hasNulls(MOSTLY);
    generator.addField("create_ts", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
    generator.addField("col13", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000).hasNulls(MOSTLY);

    // column 13
    generator.addField("size", TypeDescription.Category.STRING)
        .addStringChoice("Small", "Medium", "Large", "XL")
        .hasNulls(0.9503720861465674);
    generator.addField("col14", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000);
    generator.addField("fulfilled", TypeDescription.Category.BOOLEAN)
        .addBoolean();

    // column 16
    generator.addField("global_id", TypeDescription.Category.STRING)
        .addStringPattern("xxxxxxxxxxxxxxxx").hasNulls(0.021388793060962974);
    generator.addField("col17", TypeDescription.Category.STRING)
        .addStringPattern("L-xx").hasNulls(MOSTLY);
    generator.addField("col18", TypeDescription.Category.STRING)
        .addStringPattern("ll").hasNulls(MOSTLY);

    // column 19
    generator.addField("col19", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000);
    generator.addField("has_rebate", TypeDescription.Category.BOOLEAN)
        .addBoolean();
    // Nested column: a mostly-null list of 0-3 structs with six sub-fields.
    RandomGenerator.Field list =
        generator.addField("col21",
            TypeDescription.fromString("array<struct<sub1:bigint,sub2:string," +
                "sub3:string,sub4:bigint,sub5:bigint,sub6:string>>"))
        .addList(0, 3)
        .hasNulls(MOSTLY);
    RandomGenerator.Field struct = list.getChildField(0).addStruct();
    struct.getChildField(0).addIntegerRange(0, 10000000);
    struct.getChildField(1).addStringPattern("VVVVV");
    struct.getChildField(2).addStringPattern("VVVVVVVV");
    struct.getChildField(3).addIntegerRange(0, 10000000);
    struct.getChildField(4).addIntegerRange(0, 10000000);
    struct.getChildField(5).addStringPattern("VVVVVVVV");

    // column 38
    generator.addField("vendor_id", TypeDescription.Category.STRING)
        .addStringPattern("Lxxxxxx").hasNulls(0.1870780148834459);
    generator.addField("country", TypeDescription.Category.STRING)
        .addStringChoice("USA", "Germany", "Ireland", "Canada", "Mexico",
            "Denmark").hasNulls(0.0004928293332019384);

    // column 40
    generator.addField("backend_version", TypeDescription.Category.STRING)
        .addStringPattern("X.xx").hasNulls(0.0005913951998423039);
    generator.addField("col41", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 100000000000L);
    generator.addField("col42", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000);

    // column 43
    generator.addField("col43", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 10000000000L).hasNulls(0.9763934749396284);
    generator.addField("col44", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000000);
    generator.addField("col45", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000000);

    // column 46
    generator.addField("col46", TypeDescription.Category.LONG)
        .addIntegerRange(1, 10000000);
    generator.addField("col47", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000);
    generator.addField("col48", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(MOSTLY);

    // column 49
    generator.addField("col49", TypeDescription.Category.STRING)
        .addStringPattern("xxxx").hasNulls(0.0004928293332019384);
    generator.addField("col50", TypeDescription.Category.STRING)
        .addStringPattern("ll").hasNulls(0.9496821250800848);
    generator.addField("col51", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(0.9999014341333596);

    // column 52
    generator.addField("col52", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(0.9980779656005125);
    generator.addField("col53", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000);
    generator.addField("col54", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000);

    // column 55
    generator.addField("col55", TypeDescription.Category.STRING)
        .addStringChoice("X");
    generator.addField("col56", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
    generator.addField("col57", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");

    // column 58
    generator.addField("md5", TypeDescription.Category.LONG)
        .addRandomInt();
    generator.addField("col59", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 10000000000L);
    generator.addField("col69", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59")
        .hasNulls(MOSTLY);

    // column 61
    generator.addField("col61", TypeDescription.Category.STRING)
        .addStringPattern("X.xx").hasNulls(0.11399142476960233);
    generator.addField("col62", TypeDescription.Category.STRING)
        .addStringPattern("X.xx").hasNulls(0.9986200778670347);
    generator.addField("col63", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");

    // column 64
    generator.addField("col64", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(MOSTLY);
    rowsRemaining = rows;
  }

  /**
   * Fills the batch with up to maxSize generated rows.
   * @return false once all requested rows have been produced (rows == 0)
   */
  public boolean nextBatch(VectorizedRowBatch batch) {
    int rows = (int) Math.min(batch.getMaxSize(), rowsRemaining);
    generator.generate(batch, rows);
    rowsRemaining -= rows;
    return rows != 0;
  }

  @Override
  public void close() {
    // PASS — nothing to release; data is generated in memory.
  }

  /** @return the struct schema of the generated sales table */
  public TypeDescription getSchema() {
    return generator.getSchema();
  }

  /** Smoke test: prints the schema for a tiny generator. */
  public static void main(String[] args) throws Exception {
    SalesGenerator sales = new SalesGenerator(10, 42);
    System.out.println("Schema " + sales.getSchema());
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.bench;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.orc.TypeDescription;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Properties;

/**
 * Static helpers shared by the benchmark drivers: schema loading,
 * compression mapping, argument slicing, and path layout.
 */
public class Utilities {

  private Utilities() {
    // utility class; no instances
  }

  /**
   * Loads an ORC schema from a classpath resource, stripping whitespace so
   * multi-line schema files parse with TypeDescription.fromString.
   *
   * @param name resource name on the classpath
   * @return the parsed schema
   * @throws FileNotFoundException if the resource does not exist
   * @throws IOException if reading fails
   */
  public static TypeDescription loadSchema(String name) throws IOException {
    // try-with-resources: the original leaked the stream on every call.
    try (InputStream in =
             Utilities.class.getClassLoader().getResourceAsStream(name)) {
      if (in == null) {
        // The original would NPE here with no hint about which resource
        // was missing.
        throw new FileNotFoundException("Schema resource not found: " + name);
      }
      byte[] buffer = new byte[1024];
      StringBuilder string = new StringBuilder();
      int len = in.read(buffer);
      while (len > 0) {
        for (int i = 0; i < len; ++i) {
          byte b = buffer[i];
          // Strip newlines (including '\r' so CRLF resources parse) and
          // spaces; everything else is part of the schema text.
          if (b != '\n' && b != '\r' && b != ' ') {
            string.append((char) b);
          }
        }
        len = in.read(buffer);
      }
      return TypeDescription.fromString(string.toString());
    }
  }

  /**
   * Maps the benchmark's CompressionKind onto ORC's.
   * @throws IllegalArgumentException for kinds ORC does not support here
   */
  public static org.apache.orc.CompressionKind getCodec(CompressionKind compression) {
    switch (compression) {
      case NONE:
        return org.apache.orc.CompressionKind.NONE;
      case ZLIB:
        return org.apache.orc.CompressionKind.ZLIB;
      case SNAPPY:
        return org.apache.orc.CompressionKind.SNAPPY;
      default:
        throw new IllegalArgumentException("Unknown compression " + compression);
    }
  }

  /**
   * Views array[start..] as an Iterable without copying.
   *
   * @param array the backing array (not copied; caller must not mutate)
   * @param start first index to expose
   * @return an Iterable whose every iterator() call starts at {@code start}
   */
  public static Iterable<String> sliceArray(final String[] array,
                                            final int start) {
    return new Iterable<String>() {
      @Override
      public Iterator<String> iterator() {
        return new Iterator<String>() {
          // BUG FIX: the cursor now lives on the iterator, not the iterable.
          // The original shared one cursor across all iterators, so a second
          // iterator() call resumed where the first stopped, violating the
          // Iterable contract.
          private int posn = start;

          @Override
          public boolean hasNext() {
            return posn < array.length;
          }

          @Override
          public String next() {
            if (posn >= array.length) {
              throw new NoSuchElementException("Index off end of array." +
                  " index = " + posn + " length = " + array.length);
            } else {
              return array[posn++];
            }
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException("No remove");
          }
        };
      }
    };
  }

  /**
   * Builds the Hive SerDe properties (column name and type lists) that
   * describe the given struct schema.
   *
   * @param schema a STRUCT schema; anything else is rejected
   * @return properties with LIST_COLUMNS and LIST_COLUMN_TYPES set
   * @throws IllegalArgumentException if the root type is not a struct
   */
  public static Properties convertSchemaToHiveConfig(TypeDescription schema) {
    Properties result = new Properties();
    if (schema.getCategory() != TypeDescription.Category.STRUCT) {
      throw new IllegalArgumentException("Hive requires struct root types" +
          " instead of " + schema);
    }
    StringBuilder columns = new StringBuilder();
    StringBuilder types = new StringBuilder();
    List<String> columnNames = schema.getFieldNames();
    List<TypeDescription> columnTypes = schema.getChildren();
    for (int c = 0; c < columnNames.size(); ++c) {
      if (c != 0) {
        columns.append(",");
        types.append(",");
      }
      columns.append(columnNames.get(c));
      types.append(columnTypes.get(c));
    }
    result.setProperty(serdeConstants.LIST_COLUMNS, columns.toString());
    result.setProperty(serdeConstants.LIST_COLUMN_TYPES, types.toString());
    return result;
  }

  /**
   * Computes the path of a generated data variant:
   * {@code <root>/generated/<data>/<format>.<compress>}.
   */
  public static Path getVariant(Path root,
                                String data,
                                String format,
                                String compress) {
    return new Path(root, "generated/" + data + "/" + format + "." + compress);
  }
}
 */
public interface BatchReader extends AutoCloseable {

  /**
   * Fills the batch with the next set of rows.
   *
   * @param batch the batch to populate; implementations reset and refill it
   * @return true if any rows were read, false when the source is exhausted
   * @throws IOException if the underlying read fails
   */
  boolean nextBatch(VectorizedRowBatch batch) throws IOException;

  /**
   * Releases any resources held by the reader.
   * Narrows AutoCloseable.close() to throw only IOException so callers
   * need not handle a generic Exception.
   */
  @Override
  void close() throws IOException;
}
 */
public interface BatchWriter extends AutoCloseable {

  /**
   * Writes all rows of the batch to the output.
   *
   * @param batch the rows to write; not modified by the writer
   * @throws IOException if the underlying write fails
   */
  void writeBatch(VectorizedRowBatch batch) throws IOException;

  /**
   * Flushes and releases the writer's resources.
   * Narrows AutoCloseable.close() to throw only IOException so callers
   * need not handle a generic Exception.
   */
  @Override
  void close() throws IOException;
}
- */ - -package org.apache.orc.bench.convert; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.DefaultParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.orc.TypeDescription; -import org.apache.orc.bench.CompressionKind; -import org.apache.orc.bench.SalesGenerator; -import org.apache.orc.bench.Utilities; -import org.apache.orc.bench.convert.avro.AvroReader; -import org.apache.orc.bench.convert.avro.AvroWriter; -import org.apache.orc.bench.convert.csv.CsvReader; -import org.apache.orc.bench.convert.json.JsonReader; -import org.apache.orc.bench.convert.json.JsonWriter; -import org.apache.orc.bench.convert.orc.OrcReader; -import org.apache.orc.bench.convert.orc.OrcWriter; -import org.apache.orc.bench.convert.parquet.ParquetReader; -import org.apache.orc.bench.convert.parquet.ParquetWriter; - -import java.io.IOException; - -/** - * A tool to create the different variants that we need to benchmark against. 
- */ -public class GenerateVariants { - - public static BatchWriter createFileWriter(Path file, - String format, - TypeDescription schema, - Configuration conf, - CompressionKind compress - ) throws IOException { - FileSystem fs = file.getFileSystem(conf); - fs.delete(file, false); - fs.mkdirs(file.getParent()); - switch (format) { - case "json": - return new JsonWriter(file, schema, conf, compress); - case "orc": - return new OrcWriter(file, schema, conf, compress); - case "avro": - return new AvroWriter(file, schema, conf, compress); - case "parquet": - return new ParquetWriter(file, schema, conf, compress); - default: - throw new IllegalArgumentException("Unknown format " + format); - } - } - - public static BatchReader createFileReader(Path file, - String format, - TypeDescription schema, - Configuration conf, - CompressionKind compress - ) throws IOException { - switch (format) { - case "csv": - return new CsvReader(file, schema, conf, compress); - case "json": - return new JsonReader(file, schema, conf, compress); - case "orc": - return new OrcReader(file, schema, conf); - case "avro": - return new AvroReader(file, schema, conf); - case "parquet": - return new ParquetReader(file, schema, conf); - default: - throw new IllegalArgumentException("Unknown format " + format); - } - } - - public static class RecursiveReader implements BatchReader { - private final RemoteIterator<LocatedFileStatus> filenames; - private final String format; - private final TypeDescription schema; - private final Configuration conf; - private final CompressionKind compress; - private BatchReader current = null; - - public RecursiveReader(Path root, - String format, - TypeDescription schema, - Configuration conf, - CompressionKind compress) throws IOException { - FileSystem fs = root.getFileSystem(conf); - filenames = fs.listFiles(root, true); - this.format = format; - this.schema = schema; - this.conf = conf; - this.compress = compress; - } - - @Override - public boolean 
nextBatch(VectorizedRowBatch batch) throws IOException { - while (current == null || !current.nextBatch(batch)) { - if (filenames.hasNext()) { - LocatedFileStatus next = filenames.next(); - if (next.isFile()) { - current = createFileReader(next.getPath(), format, schema, conf, - compress); - } - } else { - return false; - } - } - return true; - } - - @Override - public void close() throws IOException { - if (current != null) { - current.close(); - } - } - } - - public static BatchReader createReader(Path root, - String dataName, - TypeDescription schema, - Configuration conf, - long salesRecords) throws IOException { - switch (dataName) { - case "taxi": - return new RecursiveReader(new Path(root, "sources/" + dataName), "csv", - schema, conf, CompressionKind.ZLIB); - case "sales": - return new SalesGenerator(salesRecords); - case "github": - return new RecursiveReader(new Path(root, "sources/" + dataName), "json", - schema, conf, CompressionKind.ZLIB); - default: - throw new IllegalArgumentException("Unknown data name " + dataName); - } - } - - static CommandLine parseCommandLine(String[] args) throws ParseException { - Options options = new Options() - .addOption("h", "help", false, "Provide help") - .addOption("c", "compress", true, "List of compression") - .addOption("d", "data", true, "List of data sets") - .addOption("f", "format", true, "List of formats") - .addOption("s", "sales", true, "Number of records for sales"); - CommandLine result = new DefaultParser().parse(options, args); - if (result.hasOption("help") || result.getArgs().length == 0) { - new HelpFormatter().printHelp("convert <root>", options); - System.exit(1); - } - return result; - } - - public static void main(String[] args) throws Exception { - CommandLine cli = parseCommandLine(args); - String[] compressList = - cli.getOptionValue("compress", "none,snappy,zlib").split(","); - String[] dataList = - cli.getOptionValue("data", "taxi,sales,github").split(","); - String[] formatList = - 
cli.getOptionValue("format", "avro,json,orc,parquet").split(","); - long records = Long.parseLong(cli.getOptionValue("sales", "25000000")); - Configuration conf = new Configuration(); - Path root = new Path(cli.getArgs()[0]); - for(String data: dataList) { - // Set up the reader - TypeDescription schema = Utilities.loadSchema(data + ".schema"); - BatchReader reader = createReader(root, data, schema, conf, records); - - // Set up the writers for each combination - BatchWriter[] writers = new BatchWriter[compressList.length * formatList.length]; - for(int compress=0; compress < compressList.length; ++compress) { - CompressionKind compressionKind = - CompressionKind.valueOf(compressList[compress].toUpperCase()); - for(int format=0; format < formatList.length; ++format) { - Path outPath = Utilities.getVariant(root, data, formatList[format], - compressList[compress]); - writers[compress * formatList.length + format] = - createFileWriter(outPath, formatList[format], schema, conf, - compressionKind); - } - } - - // Copy the rows - VectorizedRowBatch batch = schema.createRowBatch(); - while (reader.nextBatch(batch)) { - for(BatchWriter writer: writers) { - writer.writeBatch(batch); - } - } - reader.close(); - for(BatchWriter writer: writers) { - writer.close(); - } - } - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java ---------------------------------------------------------------------- diff --git a/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java b/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java deleted file mode 100644 index ae76238..0000000 --- a/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.orc.bench.convert; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.DefaultParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.orc.TypeDescription; -import org.apache.orc.bench.CompressionKind; -import org.apache.orc.bench.Utilities; - -/** - * A tool to create the different variants that we need to benchmark against. 
- */ -public class ScanVariants { - - - static CommandLine parseCommandLine(String[] args) throws ParseException { - Options options = new Options() - .addOption("h", "help", false, "Provide help") - .addOption("c", "compress", true, "List of compression") - .addOption("d", "data", true, "List of data sets") - .addOption("f", "format", true, "List of formats"); - CommandLine result = new DefaultParser().parse(options, args); - if (result.hasOption("help") || result.getArgs().length == 0) { - new HelpFormatter().printHelp("scan <root>", options); - System.exit(1); - } - return result; - } - - public static void main(String[] args) throws Exception { - CommandLine cli = parseCommandLine(args); - String[] compressList = - cli.getOptionValue("compress", "none,snappy,zlib").split(","); - String[] dataList = - cli.getOptionValue("data", "taxi,sales,github").split(","); - String[] formatList = - cli.getOptionValue("format", "avro,json,orc,parquet").split(","); - Configuration conf = new Configuration(); - Path root = new Path(cli.getArgs()[0]); - for(String data: dataList) { - TypeDescription schema = Utilities.loadSchema(data + ".schema"); - VectorizedRowBatch batch = schema.createRowBatch(); - for (String compress : compressList) { - CompressionKind compressKind = - CompressionKind.valueOf(compress.toUpperCase()); - for (String format : formatList) { - Path filename = Utilities.getVariant(root, data, format, - compress); - BatchReader reader = GenerateVariants.createFileReader(filename, - format, schema, conf, compressKind); - long rows = 0; - long batches = 0; - while (reader.nextBatch(batch)) { - batches += 1; - rows += batch.size; - } - System.out.println(filename + " rows: " + rows + " batches: " - + batches); - reader.close(); - } - } - } - } -}
