[07/15] orc git commit: trying to get it done.
trying to get it done. Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/5ae2d412 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/5ae2d412 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/5ae2d412 Branch: refs/heads/orc-72 Commit: 5ae2d4122b03bf356b5e8c48b9b05b2a4f12da38 Parents: 825a944 Author: Owen O'Malley Authored: Mon Sep 26 09:14:07 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 10 13:59:16 2016 -0700 -- java/bench/pom.xml | 17 +++- .../java/org/apache/orc/bench/SalesToOrc.java | 2 +- java/pom.xml| 41 3 files changed, 47 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/5ae2d412/java/bench/pom.xml -- diff --git a/java/bench/pom.xml b/java/bench/pom.xml index 019bdf0..738dfb3 100644 --- a/java/bench/pom.xml +++ b/java/bench/pom.xml @@ -19,7 +19,7 @@ org.apache.orc orc -1.2.0-SNAPSHOT +1.3.0-SNAPSHOT ../pom.xml @@ -34,19 +34,20 @@ org.apache.orc orc-core - 1.2.0-SNAPSHOT + + + org.apache.orc + orc-tools org.apache.avro avro - 1.8.1 org.apache.commons commons-csv - 1.4 org.apache.hadoop @@ -58,27 +59,19 @@ org.apache.hive - hive-exec - 2.1.0 - - - org.apache.hive hive-storage-api org.openjdk.jmh jmh-core - 1.12 org.openjdk.jmh jmh-generator-annprocess - 1.12 org.slf4j slf4j-simple - 1.7.5 http://git-wip-us.apache.org/repos/asf/orc/blob/5ae2d412/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java index d570728..4e715b2 100644 --- a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java +++ b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java @@ -21,7 +21,7 @@ package org.apache.orc.bench; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.orc.OrcFile; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; http://git-wip-us.apache.org/repos/asf/orc/blob/5ae2d412/java/pom.xml -- diff --git a/java/pom.xml b/java/pom.xml index 5236409..b894b2d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -214,6 +214,11 @@ orc-core 1.3.0-SNAPSHOT + +org.apache.orc +orc-tools +1.3.0-SNAPSHOT + @@ -252,6 +257,16 @@ 0.3 +org.apache.commons +commons-csv +1.4 + + +org.apache.avro +avro +1.8.1 + + org.apache.hadoop hadoop-common ${hadoop.version} @@ -350,15 +365,41 @@ ${storage-api.version} +org.apache.hive +hive-exec +2.1.0 + + +org.apache.calcite +calcite-core + + + + org.codehaus.jettison jettison 1.1 +org.openjdk.jmh +jmh-core +1.12 + + +org.openjdk.jmh +jmh-generator-annprocess +1.12 + + org.slf4j slf4j-api 1.7.5 + +org.slf4j +slf4j-simple +1.7.5 +
[orc] Git Push Summary
Repository: orc Updated Branches: refs/heads/orc-72 [deleted] 7315a0145
[05/15] orc git commit: more updates
more updates Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/1752e172 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/1752e172 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/1752e172 Branch: refs/heads/orc-72 Commit: 1752e172a0f7b16fd193f75eb57d10ca05ab0b91 Parents: 5b37113 Author: Owen O'Malley Authored: Wed Oct 5 16:04:16 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 10 13:59:16 2016 -0700 -- java/bench/pom.xml | 13 + .../orc/bench/ColumnProjectionBenchmark.java| 10 +- .../org/apache/orc/bench/CompressionKind.java | 56 .../java/org/apache/orc/bench/GithubToAvro.java | 4 +- .../java/org/apache/orc/bench/GithubToJson.java | 21 +- .../java/org/apache/orc/bench/GithubToOrc.java | 6 +- .../org/apache/orc/bench/GithubToParquet.java | 2 +- .../java/org/apache/orc/bench/SalesToJson.java | 16 +- .../java/org/apache/orc/bench/SalesToOrc.java | 2 +- .../java/org/apache/orc/bench/TaxiToJson.java | 64 +--- .../java/org/apache/orc/bench/TaxiToOrc.java| 63 +--- .../java/org/apache/orc/bench/Utilities.java| 86 ++ .../org/apache/orc/bench/avro/AvroWriter.java | 23 -- .../org/apache/orc/bench/json/JsonWriter.java | 69 + .../orc/bench/parquet/ConverterParent.java | 24 ++ .../bench/parquet/DataWritableReadSupport.java | 200 + .../parquet/DataWritableRecordConverter.java| 49 .../bench/parquet/DataWritableWriteSupport.java | 61 .../orc/bench/parquet/ETypeConverter.java | 292 +++ .../parquet/FilterPredicateLeafBuilder.java | 80 + .../bench/parquet/HiveCollectionConverter.java | 196 + .../orc/bench/parquet/HiveGroupConverter.java | 79 + .../orc/bench/parquet/HiveSchemaConverter.java | 140 + .../orc/bench/parquet/HiveStructConverter.java | 192 .../orc/bench/parquet/LeafFilterFactory.java| 200 + .../parquet/MapredParquetOutputFormat.java | 129 .../org/apache/orc/bench/parquet/NanoTime.java | 68 + .../apache/orc/bench/parquet/NanoTimeUtils.java | 113 +++ .../ParquetFilterPredicateConverter.java| 143 + .../parquet/ParquetRecordReaderWrapper.java | 276 ++ .../org/apache/orc/bench/parquet/Repeated.java | 193 java/pom.xml| 5 + .../java/org/apache/orc/tools/PrintData.java| 8 +- 33 files changed, 2699 insertions(+), 184 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/pom.xml -- diff --git a/java/bench/pom.xml b/java/bench/pom.xml index f40f21b..caee888 100644 --- a/java/bench/pom.xml +++ b/java/bench/pom.xml @@ -37,6 +37,10 @@ org.apache.orc + orc-mapreduce + + + org.apache.orc orc-tools @@ -71,6 +75,10 @@ parquet-hadoop + org.jodd + jodd-core + + org.openjdk.jmh jmh-core @@ -89,6 +97,11 @@ junit test + + org.apache.orc + orc-mapreduce + 1.3.0-SNAPSHOT + http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java index 4b17819..c53911f 100644 --- a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java +++ b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java @@ -29,8 +29,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.TrackingLocalFileSystem; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; -import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileSplit; @@ -40,6 +38,8 @@ import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; import org.apache.orc.TypeDescription; +import org.apache.orc.bench.parquet.DataWritableReadSupport; +import org.apache.orc.bench.parquet.ParquetRecordReaderWrapper; import org.apache.parquet.hadoop.ParquetInputFormat; import org.openjdk.jmh.annotations.AuxCounters; import org.openjdk.jmh.annotations.Benchmark; @@ -58,12 +58,9 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.ru
[14/15] orc git commit: fiddling with avro writer
fiddling with avro writer Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/0e8b4e46 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/0e8b4e46 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/0e8b4e46 Branch: refs/heads/orc-72 Commit: 0e8b4e46f16455cb54fea469bdd51bb9614c634f Parents: 2246b1a Author: Owen O'Malley Authored: Mon Oct 17 10:40:36 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 17 10:40:36 2016 -0700 -- .../java/org/apache/orc/bench/GithubToAvro.java| 2 +- .../src/java/org/apache/orc/bench/SalesToAvro.java | 2 +- .../src/java/org/apache/orc/bench/TaxiToAvro.java | 13 + .../java/org/apache/orc/bench/avro/AvroWriter.java | 17 + 4 files changed, 16 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/0e8b4e46/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java b/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java index eb94ff2..d31f1b6 100644 --- a/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java +++ b/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java @@ -31,7 +31,7 @@ public class GithubToAvro { TypeDescription schema = Utilities.loadSchema("github.schema"); Configuration conf = new Configuration(); AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf, -TaxiToAvro.getCodec(args[1])); +CompressionKind.valueOf(args[1])); VectorizedRowBatch batch = schema.createRowBatch(); for(String inFile: Utilities.sliceArray(args, 2)) { JsonReader reader = new JsonReader(new Path(inFile), conf, schema); http://git-wip-us.apache.org/repos/asf/orc/blob/0e8b4e46/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java b/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java index 900be66..fcfe434 100644 --- a/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java +++ b/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java @@ -31,7 +31,7 @@ public class SalesToAvro { SalesGenerator sales = new SalesGenerator(Long.parseLong(args[2])); TypeDescription schema = sales.getSchema(); AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf, -TaxiToAvro.getCodec(args[1])); +CompressionKind.valueOf(args[1])); VectorizedRowBatch batch = schema.createRowBatch(); while (sales.nextBatch(batch)) { writer.writeBatch(batch); http://git-wip-us.apache.org/repos/asf/orc/blob/0e8b4e46/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java b/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java index 9fd2f23..d5eb822 100644 --- a/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java +++ b/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java @@ -27,22 +27,11 @@ import org.apache.orc.bench.csv.CsvReader; public class TaxiToAvro { - public static String getCodec(String compression) { -if ("none".equals(compression)) { - return "null"; -} else if ("zlib".equals(compression)) { - return "deflate"; -} else if ("snappy".equals(compression)) { - return "snappy"; -} -throw new IllegalArgumentException("Unknown compression " + compression); - } - public static void main(String[] args) throws Exception { TypeDescription schema = Utilities.loadSchema("nyc-taxi.schema"); Configuration conf = new Configuration(); AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf, -getCodec(args[1])); +CompressionKind.valueOf(args[1])); VectorizedRowBatch batch = schema.createRowBatch(); for(String inFile: Utilities.sliceArray(args, 2)) { CsvReader reader = new CsvReader(new Path(inFile), conf, schema); http://git-wip-us.apache.org/repos/asf/orc/blob/0e8b4e46/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java b/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java index 8cc9d06..eeb2fee 100644 --- a/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java +++ b/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java @@ -37,12 +37,12 @@ import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql
[03/15] orc git commit: ORC-72. Add benchmarks to ORC.
ORC-72. Add benchmarks to ORC. Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/825a9441 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/825a9441 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/825a9441 Branch: refs/heads/orc-72 Commit: 825a9441fdcdb8ceb392ae80bed0324e93f7b07d Parents: 37b939b Author: Owen O'Malley Authored: Tue Jun 14 10:00:15 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 10 13:59:16 2016 -0700 -- java/bench/.gitignore | 5 + java/bench/fetch-data.sh| 6 + java/bench/pom.xml | 138 .../hadoop/fs/TrackingLocalFileSystem.java | 57 ++ .../hadoop/hive/ql/io/orc/VectorToWritable.java | 70 ++ .../src/java/org/apache/orc/bench/AvroScan.java | 48 ++ .../java/org/apache/orc/bench/AvroWriter.java | 368 ++ .../orc/bench/ColumnProjectionBenchmark.java| 192 + .../java/org/apache/orc/bench/CsvReader.java| 175 + .../src/java/org/apache/orc/bench/CsvScan.java | 40 ++ .../org/apache/orc/bench/FullReadBenchmark.java | 222 ++ .../java/org/apache/orc/bench/GithubToAvro.java | 42 ++ .../java/org/apache/orc/bench/GithubToJson.java | 51 ++ .../java/org/apache/orc/bench/GithubToOrc.java | 48 ++ .../org/apache/orc/bench/GithubToParquet.java | 63 ++ .../java/org/apache/orc/bench/JsonReader.java | 278 .../src/java/org/apache/orc/bench/JsonScan.java | 61 ++ .../src/java/org/apache/orc/bench/OrcScan.java | 46 ++ .../java/org/apache/orc/bench/ParquetScan.java | 54 ++ .../org/apache/orc/bench/RandomGenerator.java | 523 ++ .../org/apache/orc/bench/SalesGenerator.java| 200 ++ .../java/org/apache/orc/bench/SalesToAvro.java | 40 ++ .../java/org/apache/orc/bench/SalesToJson.java | 49 ++ .../java/org/apache/orc/bench/SalesToOrc.java | 42 ++ .../org/apache/orc/bench/SalesToParquet.java| 61 ++ .../java/org/apache/orc/bench/TaxiToAvro.java | 53 ++ .../java/org/apache/orc/bench/TaxiToJson.java | 93 +++ .../java/org/apache/orc/bench/TaxiToOrc.java| 108 +++ .../org/apache/orc/bench/TaxiToParquet.java | 75 ++ java/bench/src/main/resources/github.schema | 702 +++ java/bench/src/main/resources/log4j.properties | 6 + java/bench/src/main/resources/nyc-taxi.schema | 21 + .../java/org/apache/orc/TypeDescription.java| 2 +- java/pom.xml| 1 + .../src/java/org/apache/orc/tools/FileDump.java | 1 - 35 files changed, 3939 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/.gitignore -- diff --git a/java/bench/.gitignore b/java/bench/.gitignore new file mode 100644 index 000..babcae6 --- /dev/null +++ b/java/bench/.gitignore @@ -0,0 +1,5 @@ +.*.crc +*.json.gz +*.avro +*.parquet +*.orc http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/fetch-data.sh -- diff --git a/java/bench/fetch-data.sh b/java/bench/fetch-data.sh new file mode 100644 index 000..79e77ff --- /dev/null +++ b/java/bench/fetch-data.sh @@ -0,0 +1,6 @@ +#!/usr/bin/bash +mkdir -p data/nyc +(cd data/nyc; wget https://storage.googleapis.com/tlc-trip-data/2015/yellow_tripdata_2015-{11..12}.csv) +(cd data/nyc; gzip *.csv) +mkdir -p data/github +(cd data/github; wget http://data.githubarchive.org/2015-11-{01..15}-{0..23}.json.gz) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/pom.xml -- diff --git a/java/bench/pom.xml b/java/bench/pom.xml new file mode 100644 index 000..019bdf0 --- /dev/null +++ b/java/bench/pom.xml @@ -0,0 +1,138 @@ + + +http://maven.apache.org/POM/4.0.0"; + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"; + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd";> + 4.0.0 + +org.apache.orc +orc +1.2.0-SNAPSHOT +../pom.xml + + + orc-benchmarks + jar + ORC Benchmarks + +Benchmarks for comparing ORC, Parquet, and Avro performance. + + + + + org.apache.orc + orc-core + 1.2.0-SNAPSHOT + + + + + org.apache.avro + avro + 1.8.1 + + + org.apache.commons + commons-csv + 1.4 + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-mapreduce-client-core + + + org.apache.hive + hive-exec + 2.1.0 + + + org.apache.hive + hive-storage-api + + + org.openjdk.jmh +
[02/15] orc git commit: ORC-72. Add benchmarks to ORC.
http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java b/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java new file mode 100644 index 000..fe8f85e --- /dev/null +++ b/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java @@ -0,0 +1,523 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.bench; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.TypeDescription; + +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +public class RandomGenerator { + private final TypeDescription schema = TypeDescription.createStruct(); + private final List fields = new ArrayList<>(); + private final Random random; + + public RandomGenerator(int seed) { +random = new Random(seed); + } + + private abstract class ValueGenerator { +double nullProbability = 0; +abstract void generate(ColumnVector vector, int valueCount); + } + + private class RandomBoolean extends ValueGenerator { +public void generate(ColumnVector v, int valueCount) { + LongColumnVector vector = (LongColumnVector) v; + for(int r=0; r < valueCount; ++r) { +if (nullProbability != 0 && random.nextDouble() < nullProbability) { + v.noNulls = false; + v.isNull[r] = true; +} else { + vector.vector[r] = random.nextInt(2); +} + } +} + } + + private class RandomList extends ValueGenerator { +private final int minSize; +private final int sizeRange; +private final Field child; + +public RandomList(int minSize, int maxSize, Field child) { + this.minSize = minSize; + this.sizeRange = maxSize - minSize + 1; + this.child = child; +} + +public void generate(ColumnVector v, int valueCount) { + ListColumnVector vector = (ListColumnVector) v; + for(int r=0; r < valueCount; ++r) { +if (nullProbability != 0 && random.nextDouble() < nullProbability) { + v.noNulls = false; + v.isNull[r] = true; +} else { + vector.offsets[r] = vector.childCount; + vector.lengths[r] = random.nextInt(sizeRange) + minSize; + vector.childCount += vector.lengths[r]; +} + } + vector.child.ensureSize(vector.childCount, false); + child.generator.generate(vector.child, vector.childCount); +} + } + + private class RandomStruct extends ValueGenerator { +private final Field[] children; + +public RandomStruct(Field[] children) { + this.children = children; +} + +public void generate(ColumnVector v, int valueCount) { + StructColumnVector vector = (StructColumnVector) v; + for(int r=0; r < valueCount; ++r) { +if (nullProbability != 0 && random.nextDouble() < nullProbability) { + v.noNulls = false; + v.isNull[r] = true; +} + } + for(int c=0; c < children.length; ++c) { +children[c].generator.generate(vector.fields[c], valueCount); + } +} + } + + private abstract class IntegerGenerator extends ValueGenerator { +private final long sign; +private final long mask; + +private IntegerGenerator(TypeDescription.Category kind) { + int bits = getIntegerLength(kind); + mask = bits == 64 ? 0 : -1L << bits; + sign = 1L << (bits - 1); +} + +protected void normalize(LongColumnVector vector, int valueCount) { + // make sure the value stays in range by sign extending it + for(int r=0; r < valueCount; ++r) { +if ((vector.vector[r] & sign) == 0) { + vector.vector[
[12/15] orc git commit: fix urls
fix urls Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/839d29d7 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/839d29d7 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/839d29d7 Branch: refs/heads/orc-72 Commit: 839d29d743263f7ce8c6464eb8536aaa4a1571bc Parents: 611388b Author: Owen O'Malley Authored: Fri Oct 14 11:16:50 2016 -0700 Committer: Owen O'Malley Committed: Fri Oct 14 11:16:50 2016 -0700 -- java/bench/fetch-data.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/839d29d7/java/bench/fetch-data.sh -- diff --git a/java/bench/fetch-data.sh b/java/bench/fetch-data.sh old mode 100644 new mode 100755 index 79e77ff..e139117 --- a/java/bench/fetch-data.sh +++ b/java/bench/fetch-data.sh @@ -1,6 +1,6 @@ #!/usr/bin/bash mkdir -p data/nyc -(cd data/nyc; wget https://storage.googleapis.com/tlc-trip-data/2015/yellow_tripdata_2015-{11..12}.csv) +(cd data/nyc; wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-{11,12}.csv) (cd data/nyc; gzip *.csv) mkdir -p data/github -(cd data/github; wget http://data.githubarchive.org/2015-11-{01..15}-{0..23}.json.gz) \ No newline at end of file +(cd data/github; wget http://data.githubarchive.org/2015-11-{01..15}-{0..23}.json.gz)
[15/15] orc git commit: Fiddling with orc writer
Fiddling with orc writer Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/7315a014 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/7315a014 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/7315a014 Branch: refs/heads/orc-72 Commit: 7315a0145efe92cc1e19c2b13a51b1b5d63779ff Parents: 0e8b4e4 Author: Owen O'Malley Authored: Mon Oct 17 11:24:41 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 17 11:24:41 2016 -0700 -- .../java/org/apache/orc/bench/GithubToOrc.java | 2 +- .../java/org/apache/orc/bench/SalesToOrc.java | 3 +-- .../java/org/apache/orc/bench/TaxiToOrc.java| 7 +-- .../java/org/apache/orc/bench/Utilities.java| 20 ++-- 4 files changed, 13 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/7315a014/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java b/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java index a04b08e..9caca4a 100644 --- a/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java +++ b/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java @@ -34,7 +34,7 @@ public class GithubToOrc { Configuration conf = new Configuration(); Writer writer = OrcFile.createWriter(new Path(args[0]), OrcFile.writerOptions(conf).setSchema(schema) -.compress(Utilities.getCodec(args[1]))); +.compress(Utilities.getCodec(CompressionKind.valueOf(args[1]; for(String inFile: Utilities.sliceArray(args, 2)) { JsonReader reader = new JsonReader(new Path(inFile), conf, schema); while (reader.nextBatch(batch)) { http://git-wip-us.apache.org/repos/asf/orc/blob/7315a014/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java index d3b2615..062b863 100644 --- a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java +++ b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java @@ -22,7 +22,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; -import org.apache.orc.TypeDescription; import org.apache.orc.Writer; public class SalesToOrc { @@ -33,7 +32,7 @@ public class SalesToOrc { Writer writer = OrcFile.createWriter(new Path(args[0]), OrcFile.writerOptions(conf) .setSchema(sales.getSchema()) -.compress(Utilities.getCodec(args[1]))); +.compress(Utilities.getCodec(CompressionKind.valueOf(args[1]; while (sales.nextBatch(batch)) { writer.addRowBatch(batch); } http://git-wip-us.apache.org/repos/asf/orc/blob/7315a014/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java b/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java index e95f794..89f0d0e 100644 --- a/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java +++ b/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java @@ -22,15 +22,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; -import org.apache.orc.CompressionKind; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.apache.orc.bench.csv.CsvReader; -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; - public class TaxiToOrc { public static void main(String[] args) throws Exception { @@ -40,7 +35,7 @@ public class TaxiToOrc { Writer writer = OrcFile.createWriter(new Path(args[0]), OrcFile.writerOptions(conf) .setSchema(schema) -.compress(Utilities.getCodec(args[1]))); +.compress(Utilities.getCodec(CompressionKind.valueOf(args[1]; for(String inFile: Utilities.sliceArray(args, 2)) { CsvReader reader = new CsvReader(new Path(inFile), conf, schema); while (reader.nextBatch(batch)) { http://git-wip-us.apache.org/repos/asf/orc/blob/7315a014/java/bench/src/java/org/apache/orc/bench/Utilities.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/Utilities.java b/java/bench/src/java/org/apache/orc/bench/Utilities.java index 9a95ae9..d3e10de 100644 --- a/java/bench/src/java/org/apache/orc/bench/Utilities.java +++ b/java/b
[13/15] orc git commit: minor debugging
minor debugging Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/2246b1a0 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/2246b1a0 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/2246b1a0 Branch: refs/heads/orc-72 Commit: 2246b1a01a26e01e72b31047f85a81c0cf64f08f Parents: 839d29d Author: Owen O'Malley Authored: Fri Oct 14 13:06:36 2016 -0700 Committer: Owen O'Malley Committed: Fri Oct 14 13:07:23 2016 -0700 -- .../java/org/apache/orc/bench/avro/AvroSchemaUtils.java | 12 +++- .../src/java/org/apache/orc/bench/avro/AvroWriter.java | 2 ++ 2 files changed, 9 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/2246b1a0/java/bench/src/java/org/apache/orc/bench/avro/AvroSchemaUtils.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/avro/AvroSchemaUtils.java b/java/bench/src/java/org/apache/orc/bench/avro/AvroSchemaUtils.java index 5df7b70..893a4fd 100644 --- a/java/bench/src/java/org/apache/orc/bench/avro/AvroSchemaUtils.java +++ b/java/bench/src/java/org/apache/orc/bench/avro/AvroSchemaUtils.java @@ -110,7 +110,7 @@ public class AvroSchemaUtils { throw new UnsupportedOperationException(typeInfo + " is not supported."); } -return wrapInUnionWithNull(schema); +return schema; } private static Schema createAvroUnion(TypeDescription typeInfo) { @@ -128,7 +128,7 @@ public class AvroSchemaUtils { } } -return Schema.createUnion(childSchemas); +return wrapInUnionWithNull(Schema.createUnion(childSchemas)); } private static Schema createAvroRecord(TypeDescription typeInfo) { @@ -140,7 +140,8 @@ public class AvroSchemaUtils { for (int i = 0; i < fieldNames.size(); ++i) { TypeDescription childTypeInfo = fieldTypes.get(i); Schema.Field field = new Schema.Field(fieldNames.get(i), - createAvroSchema(childTypeInfo), childTypeInfo.toString(), + wrapInUnionWithNull(createAvroSchema(childTypeInfo)), + childTypeInfo.toString(), (Object) null); childFields.add(field); } @@ -158,14 +159,15 @@ public class AvroSchemaUtils { + typeInfo); } -Schema valueSchema = createAvroSchema(typeInfo.getChildren().get(1)); +Schema valueSchema = wrapInUnionWithNull(createAvroSchema +(typeInfo.getChildren().get(1))); return Schema.createMap(valueSchema); } private static Schema createAvroArray(TypeDescription typeInfo) { Schema child = createAvroSchema(typeInfo.getChildren().get(0)); -return Schema.createArray(child); +return Schema.createArray(wrapInUnionWithNull(child)); } private static Schema wrapInUnionWithNull(Schema schema) { http://git-wip-us.apache.org/repos/asf/orc/blob/2246b1a0/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java b/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java index 2735a71..8cc9d06 100644 --- a/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java +++ b/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java @@ -308,6 +308,8 @@ public class AvroWriter { String compression) throws IOException { List childTypes = schema.getChildren(); Schema avroSchema = AvroSchemaUtils.createAvroSchema(schema); +System.out.println("Hive schema " + schema); +System.out.println("Avro schema " + avroSchema); List avroFields = avroSchema.getFields(); converters = new AvroConverter[childTypes.size()]; for(int c=0; c < converters.length; ++c) {
[10/15] orc git commit: more updates
more updates Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/0f56aaad Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/0f56aaad Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/0f56aaad Branch: refs/heads/orc-72 Commit: 0f56aaade0b33c5473d640fd70adb7ee59a694a3 Parents: 73cdb4c Author: Owen O'Malley Authored: Tue Oct 11 15:25:10 2016 -0700 Committer: Owen O'Malley Committed: Tue Oct 11 15:25:10 2016 -0700 -- .../java/org/apache/orc/bench/TaxiToAvro.java | 4 +- .../bench/parquet/DataWritableWriteSupport.java | 18 +- .../orc/bench/parquet/DataWritableWriter.java | 466 --- .../apache/orc/bench/parquet/ParquetScan.java | 6 +- .../apache/orc/bench/parquet/RowInBatch.java| 65 ++- 5 files changed, 283 insertions(+), 276 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/0f56aaad/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java b/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java index 2b14f50..9fd2f23 100644 --- a/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java +++ b/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java @@ -39,12 +39,12 @@ public class TaxiToAvro { } public static void main(String[] args) throws Exception { -TypeDescription schema = TaxiToOrc.loadSchema("nyc-taxi.schema"); +TypeDescription schema = Utilities.loadSchema("nyc-taxi.schema"); Configuration conf = new Configuration(); AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf, getCodec(args[1])); VectorizedRowBatch batch = schema.createRowBatch(); -for(String inFile: TaxiToOrc.sliceArray(args, 2)) { +for(String inFile: Utilities.sliceArray(args, 2)) { CsvReader reader = new CsvReader(new Path(inFile), conf, schema); while (reader.nextBatch(batch)) { writer.writeBatch(batch); http://git-wip-us.apache.org/repos/asf/orc/blob/0f56aaad/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java index f4621e5..2b8a1d3 100644 --- a/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java +++ b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.ql.io.parquet.write; +package org.apache.orc.bench.parquet; import java.util.HashMap; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord; +import org.apache.orc.TypeDescription; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.io.api.RecordConsumer; import org.apache.parquet.schema.MessageType; @@ -28,15 +28,20 @@ import org.apache.parquet.schema.MessageTypeParser; * DataWritableWriteSupport is a WriteSupport for the DataWritableWriter * */ -public class DataWritableWriteSupport extends WriteSupport { +public class DataWritableWriteSupport extends WriteSupport { public static final String PARQUET_HIVE_SCHEMA = "parquet.hive.schema"; + public static final String HIVE_SCHEMA = "hive.schema"; private DataWritableWriter writer; private MessageType schema; + private TypeDescription hiveType; - public static void setSchema(final MessageType schema, final Configuration configuration) { + public static void setSchema(final MessageType schema, + final TypeDescription hiveType, + final Configuration configuration) { configuration.set(PARQUET_HIVE_SCHEMA, schema.toString()); +configuration.set(HIVE_SCHEMA, hiveType.toString()); } public static MessageType getSchema(final Configuration configuration) { @@ -46,16 +51,17 @@ public class DataWritableWriteSupport extends WriteSupport { @Override public WriteContext init(final Configuration configuration) { schema = getSchema(configuration); +hiveType = TypeDescription.fromString(configuration.get(HIVE_SCHEMA)); return new WriteContext(schema, new HashMap()); } @Override public void prepareForWrite(final RecordConsumer recordConsumer) { -writer = new DataWritableWriter(recordConsumer, schema); +writer = new DataWritableWriter(recordConsumer, schema, hiveType); } @Override - public void write(final ParquetHiveRecord record) { + public void write(fi
[01/15] orc git commit: more updates
Repository: orc Updated Branches: refs/heads/orc-72 [created] 7315a0145 more updates Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/73cdb4c2 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/73cdb4c2 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/73cdb4c2 Branch: refs/heads/orc-72 Commit: 73cdb4c2de03e512d5be28daaf5d1f0f292535b7 Parents: 1752e17 Author: Owen O'Malley Authored: Mon Oct 10 09:30:20 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 10 13:59:16 2016 -0700 -- .../orc/bench/parquet/DataWritableWriter.java | 550 +++ .../apache/orc/bench/parquet/RowInBatch.java| 33 ++ 2 files changed, 583 insertions(+) -- http://git-wip-us.apache.org/repos/asf/orc/blob/73cdb4c2/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriter.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriter.java b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriter.java new file mode 100644 index 000..220e452 --- /dev/null +++ b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriter.java @@ -0,0 +1,550 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.bench.parquet; + +import org.apache.orc.TypeDescription; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.Type; + +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Map; + +/** + * + * DataWritableWriter sends a record to the Parquet API with the expected schema in order + * to be written to a file. + * This class is only used through DataWritableWriteSupport class. + */ +public class DataWritableWriter { + private static final Logger LOG = LoggerFactory.getLogger(DataWritableWriter.class); + protected final RecordConsumer recordConsumer; + private final GroupType schema; + private final TypeDescription hiveType; + + /* This writer will be created when writing the first row in order to get + information about how to inspect the record data. */ + private final DataWriter messageWriter; + + public DataWritableWriter(final RecordConsumer recordConsumer, +final GroupType schema, +TypeDescription hiveType) { +this.recordConsumer = recordConsumer; +this.schema = schema; +this.hiveType = hiveType; +messageWriter = createMessageWriter(hiveType, schema); + } + + /** + * It writes a record to Parquet. + * @param record Contains the record that is going to be written. + */ + public void write(final RowInBatch record) { + messageWriter.write(record); + } + + private MessageDataWriter createMessageWriter(TypeDescription hiveType, +GroupType schema) { +return new MessageDataWriter(hiveType, schema); + } + + /** + * Creates a writer for the specific object inspector. The returned writer will be used + * to call Parquet API for the specific data type. + * @param hiveType The type description used to get the correct value type. + * @param type Type that contains information about the type schema. + * @return A ParquetWriter object used to call the Parquet API fo the specific data type. + */ + private DataWriter createWriter(TypeDescription hiveType, Type type) { +switch (hiveType.getCategory()) { + case BOOLEAN: +return new BooleanDataWriter(hiveType); + case BYTE: +return new ByteDataWriter(hiveType); + case SHORT: +return new ShortDataWriter(hiveType); + case INT: +return new IntDataWriter(hiveType); + case LONG: +return new LongDataWriter(hiveType); + case FLOAT: +return new FloatDataWriter(hiveType); + case DOUBLE: +return new DoubleDataWriter(hiveType); + case STRING: +return new StringDataWriter(hiveType); + case CHAR: +return
[11/15] orc git commit: It compiles
It compiles Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/611388b5 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/611388b5 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/611388b5 Branch: refs/heads/orc-72 Commit: 611388b50f8f3e9deeaa0e5335f0afd7f0ab9422 Parents: 0f56aaa Author: Owen O'Malley Authored: Wed Oct 12 16:06:29 2016 -0700 Committer: Owen O'Malley Committed: Wed Oct 12 16:06:29 2016 -0700 -- .../org/apache/orc/bench/CompressionKind.java | 17 ++- .../org/apache/orc/bench/FullReadBenchmark.java | 20 +-- .../org/apache/orc/bench/GithubToParquet.java | 34 ++--- .../org/apache/orc/bench/SalesToParquet.java| 34 + .../org/apache/orc/bench/TaxiToParquet.java | 49 ++- .../java/org/apache/orc/bench/csv/CsvScan.java | 4 +- .../bench/parquet/DataWritableReadSupport.java | 2 - .../bench/parquet/HiveCollectionConverter.java | 2 - .../parquet/MapredParquetOutputFormat.java | 129 --- .../parquet/ParquetRecordReaderWrapper.java | 2 - .../apache/orc/bench/parquet/ParquetWriter.java | 72 +++ 11 files changed, 118 insertions(+), 247 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/611388b5/java/bench/src/java/org/apache/orc/bench/CompressionKind.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/CompressionKind.java b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java index 9fe9ba9..86ac476 100644 --- a/java/bench/src/java/org/apache/orc/bench/CompressionKind.java +++ b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java @@ -20,11 +20,13 @@ package org.apache.orc.bench; import io.airlift.compress.snappy.SnappyCodec; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; +import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; /** - * Created by owen on 10/5/16. + * Enum for handling the compression codecs for the benchmark */ public enum CompressionKind { NONE(""), @@ -53,4 +55,17 @@ public enum CompressionKind { throw new IllegalArgumentException("Unhandled kind " + this); } } + + public InputStream read(InputStream in) throws IOException { +switch (this) { + case NONE: +return in; + case ZLIB: +return new GZIPInputStream(in); + case SNAPPY: +return new SnappyCodec().createInputStream(in); + default: +throw new IllegalArgumentException("Unhandled kind " + this); +} + } } http://git-wip-us.apache.org/repos/asf/orc/blob/611388b5/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java b/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java index 917707d..849e030 100644 --- a/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java +++ b/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java @@ -29,19 +29,17 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.TrackingLocalFileSystem; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; -import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Reporter; import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; import org.apache.orc.TypeDescription; +import org.apache.orc.bench.parquet.DataWritableReadSupport; +import org.apache.orc.bench.parquet.ParquetRecordReaderWrapper; import org.apache.parquet.hadoop.ParquetInputFormat; -import io.airlift.compress.snappy.HadoopSnappyInputStream; import org.openjdk.jmh.annotations.AuxCounters; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -63,7 +61,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.net.URI; import java.util.concurrent.TimeUnit; -import java.util.zip.GZIPInputStream; @BenchmarkMode(Mode.AverageTime) @Warmup(iterations=1, time=10, timeUnit = TimeUnit.SECONDS) @@ -173,8 +170,7 @@ public class FullReadBenchmark { NullWritable nada = NullWritable.get(); FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{}); org.apache.hadoop.mapred.RecordReader recordReader = -new ParquetRecordReaderWrapper(inputFormat, split, conf, -Reporter.NULL);
[09/15] orc git commit: more updates
more updates Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/5b37113b Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/5b37113b Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/5b37113b Branch: refs/heads/orc-72 Commit: 5b37113b73eb0e12744f2711326e11cd2ef6eaef Parents: 86628bc Author: Owen O'Malley Authored: Mon Oct 3 10:01:40 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 10 13:59:16 2016 -0700 -- java/bench/pom.xml | 4 + .../src/java/org/apache/orc/bench/AvroScan.java | 47 --- .../org/apache/orc/bench/AvroSchemaUtils.java | 190 -- .../java/org/apache/orc/bench/AvroWriter.java | 375 --- .../java/org/apache/orc/bench/CsvReader.java| 175 - .../src/java/org/apache/orc/bench/CsvScan.java | 40 -- .../java/org/apache/orc/bench/GithubToAvro.java | 2 + .../java/org/apache/orc/bench/GithubToJson.java | 2 +- .../java/org/apache/orc/bench/GithubToOrc.java | 4 +- .../org/apache/orc/bench/GithubToParquet.java | 2 + .../java/org/apache/orc/bench/JsonReader.java | 278 -- .../src/java/org/apache/orc/bench/JsonScan.java | 61 --- .../src/java/org/apache/orc/bench/OrcScan.java | 46 --- .../java/org/apache/orc/bench/ParquetScan.java | 54 --- .../java/org/apache/orc/bench/SalesToAvro.java | 1 + .../org/apache/orc/bench/SalesToParquet.java| 1 + .../java/org/apache/orc/bench/TaxiToAvro.java | 2 + .../java/org/apache/orc/bench/TaxiToJson.java | 1 + .../java/org/apache/orc/bench/TaxiToOrc.java| 1 + .../org/apache/orc/bench/TaxiToParquet.java | 2 + .../org/apache/orc/bench/avro/AvroScan.java | 47 +++ .../apache/orc/bench/avro/AvroSchemaUtils.java | 190 ++ .../org/apache/orc/bench/avro/AvroWriter.java | 375 +++ .../org/apache/orc/bench/csv/CsvReader.java | 175 + .../java/org/apache/orc/bench/csv/CsvScan.java | 41 ++ .../org/apache/orc/bench/json/JsonReader.java | 278 ++ .../org/apache/orc/bench/json/JsonScan.java | 61 +++ .../java/org/apache/orc/bench/orc/OrcScan.java | 46 +++ .../apache/orc/bench/parquet/ParquetScan.java | 54 +++ java/pom.xml| 15 +- 30 files changed, 1295 insertions(+), 1275 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/5b37113b/java/bench/pom.xml -- diff --git a/java/bench/pom.xml b/java/bench/pom.xml index f0bf55a..f40f21b 100644 --- a/java/bench/pom.xml +++ b/java/bench/pom.xml @@ -67,6 +67,10 @@ hive-storage-api + org.apache.parquet + parquet-hadoop + + org.openjdk.jmh jmh-core http://git-wip-us.apache.org/repos/asf/orc/blob/5b37113b/java/bench/src/java/org/apache/orc/bench/AvroScan.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/AvroScan.java b/java/bench/src/java/org/apache/orc/bench/AvroScan.java deleted file mode 100644 index 61f6a62..000 --- a/java/bench/src/java/org/apache/orc/bench/AvroScan.java +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.orc.bench; - -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumReader; -import org.apache.avro.mapred.FsInput; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -public class AvroScan { - public static void main(String[] args) throws Exception { -Configuration conf = new Configuration(); -long rowCount = 0; -for(String filename: args) { - FsInput file = new FsInput(new Path(filename), conf); - DatumReader datumReader = new GenericDatumReader<>(); - DataFileReader dataFileReader = - new DataFileReader<>(file, dat
[06/15] orc git commit: more updates
more updates Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/86628bcb Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/86628bcb Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/86628bcb Branch: refs/heads/orc-72 Commit: 86628bcbffc1d19f8f2f1fe5c840ac9d429d3dc6 Parents: 5ae2d41 Author: Owen O'Malley Authored: Sat Oct 1 10:24:32 2016 -0700 Committer: Owen O'Malley Committed: Mon Oct 10 13:59:16 2016 -0700 -- java/bench/pom.xml | 5 + .../hadoop/hive/ql/io/orc/VectorToWritable.java | 70 --- .../src/java/org/apache/orc/bench/AvroScan.java | 1 - .../org/apache/orc/bench/AvroSchemaUtils.java | 190 +++ .../java/org/apache/orc/bench/AvroWriter.java | 31 +-- .../orc/bench/ColumnProjectionBenchmark.java| 1 - .../org/apache/orc/bench/FullReadBenchmark.java | 4 +- .../java/org/apache/orc/bench/GithubToOrc.java | 2 +- .../java/org/apache/orc/bench/TaxiToOrc.java| 2 +- java/pom.xml| 17 +- 10 files changed, 224 insertions(+), 99 deletions(-) -- http://git-wip-us.apache.org/repos/asf/orc/blob/86628bcb/java/bench/pom.xml -- diff --git a/java/bench/pom.xml b/java/bench/pom.xml index 738dfb3..f0bf55a 100644 --- a/java/bench/pom.xml +++ b/java/bench/pom.xml @@ -46,6 +46,11 @@ avro + org.apache.avro + avro-mapred + hadoop2 + + org.apache.commons commons-csv http://git-wip-us.apache.org/repos/asf/orc/blob/86628bcb/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/VectorToWritable.java -- diff --git a/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/VectorToWritable.java b/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/VectorToWritable.java deleted file mode 100644 index ae8e8da..000 --- a/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/VectorToWritable.java +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.orc; - -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.orc.OrcProto; -import org.apache.orc.OrcUtils; -import org.apache.orc.TypeDescription; - -import java.util.List; - -/** - * This class is just here to provide a public API to some of the ORC internal - * methods. - */ -public class VectorToWritable { - public static ObjectInspector createObjectInspector(TypeDescription schema) { -// convert the type descr to protobuf types -List types = OrcUtils.getOrcTypes(schema); -// convert the protobuf types to an ObjectInspector -return OrcStruct.createObjectInspector(0, types); - } - - public static Object createValue(VectorizedRowBatch batch, - int row, - TypeDescription schema, - Object previous) { -if(schema.getCategory() == TypeDescription.Category.STRUCT) { - List children = schema.getChildren(); - int numberOfChildren = children.size(); - OrcStruct result; - if(previous != null && previous.getClass() == OrcStruct.class) { -result = (OrcStruct)previous; -if(result.getNumFields() != numberOfChildren) { - result.setNumFields(numberOfChildren); -} - } else { -result = new OrcStruct(numberOfChildren); -previous = result; - } - - for(int i = 0; i < numberOfChildren; ++i) { -result.setFieldValue(i, RecordReaderImpl.nextValue(batch.cols[i], row, -children.get(i), result.getFieldValue(i))); - } -} else { - previous = RecordReaderImpl.nextValue(batch.cols[0], row, schema, - previous); -} -; -return previous; - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/86628bcb/java/bench/sr
[08/15] orc git commit: more updates
http://git-wip-us.apache.org/repos/asf/orc/blob/5b37113b/java/bench/src/java/org/apache/orc/bench/csv/CsvScan.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/csv/CsvScan.java b/java/bench/src/java/org/apache/orc/bench/csv/CsvScan.java new file mode 100644 index 000..ae78cc4 --- /dev/null +++ b/java/bench/src/java/org/apache/orc/bench/csv/CsvScan.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.bench.csv; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.TypeDescription; +import org.apache.orc.bench.TaxiToOrc; + +public class CsvScan { + public static void main(String[] args) throws Exception { +Configuration conf = new Configuration(); +long rowCount = 0; +TypeDescription schema = TaxiToOrc.loadSchema("nyc-taxi.schema"); +for(String filename: args) { + CsvReader reader = new CsvReader(new Path(filename), conf, schema); + VectorizedRowBatch batch = schema.createRowBatch(); + while (reader.nextBatch(batch)) { +rowCount += batch.size; + } +} +System.out.println("Rows read: " + rowCount); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/5b37113b/java/bench/src/java/org/apache/orc/bench/json/JsonReader.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/json/JsonReader.java b/java/bench/src/java/org/apache/orc/bench/json/JsonReader.java new file mode 100644 index 000..a5057e4 --- /dev/null +++ b/java/bench/src/java/org/apache/orc/bench/json/JsonReader.java @@ -0,0 +1,278 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.bench.json; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonStreamParser; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.TypeDescription; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.sql.Timestamp; +import java.util.List; +import java.util.zip.GZIPInputStream; + +public class JsonReader { + private final TypeDescription schema; + private final JsonStreamParser parser; + private final JsonConverter[] converters; + + interface JsonConverter { +void convert(JsonElement value, ColumnVector vect, int row); + } + + static class BooleanColumnConverter implements JsonConverter { +public void convert(JsonElement value, ColumnVec
[04/15] orc git commit: more updates
http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/LeafFilterFactory.java -- diff --git a/java/bench/src/java/org/apache/orc/bench/parquet/LeafFilterFactory.java b/java/bench/src/java/org/apache/orc/bench/parquet/LeafFilterFactory.java new file mode 100644 index 000..13a822a --- /dev/null +++ b/java/bench/src/java/org/apache/orc/bench/parquet/LeafFilterFactory.java @@ -0,0 +1,200 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.bench.parquet; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; +import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.Operator; + +import org.apache.parquet.filter2.predicate.FilterApi; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import static org.apache.parquet.filter2.predicate.FilterApi.eq; +import static org.apache.parquet.filter2.predicate.FilterApi.lt; +import static org.apache.parquet.filter2.predicate.FilterApi.ltEq; +import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.booleanColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.floatColumn; +import static org.apache.parquet.filter2.predicate.FilterApi.intColumn; + +public class LeafFilterFactory { + private static final Logger LOG = LoggerFactory.getLogger(LeafFilterFactory.class); + + class IntFilterPredicateLeafBuilder extends FilterPredicateLeafBuilder { +/** + * @param op consists of EQUALS, NULL_SAFE_EQUALS, LESS_THAN, LESS_THAN_EQUALS, IS_NULL + * @param literal + * @param columnName + * @return + */ +@Override +public FilterPredicate buildPredict(Operator op, Object literal, +String columnName) { + switch (op) { +case LESS_THAN: + return lt(intColumn(columnName), ((Number) literal).intValue()); +case IS_NULL: +case EQUALS: +case NULL_SAFE_EQUALS: + return eq(intColumn(columnName), +(literal == null) ? null : ((Number) literal).intValue()); +case LESS_THAN_EQUALS: + return ltEq(intColumn(columnName), ((Number) literal).intValue()); +default: + throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op); + } +} + } + + class LongFilterPredicateLeafBuilder extends FilterPredicateLeafBuilder { +@Override +public FilterPredicate buildPredict(Operator op, Object constant, +String columnName) { + switch (op) { +case LESS_THAN: + return lt(FilterApi.longColumn(columnName), ((Number) constant).longValue()); +case IS_NULL: +case EQUALS: +case NULL_SAFE_EQUALS: + return eq(FilterApi.longColumn(columnName), +(constant == null) ? null : ((Number) constant).longValue()); +case LESS_THAN_EQUALS: + return ltEq(FilterApi.longColumn(columnName), +((Number) constant).longValue()); +default: + throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op); + } +} + } + + class FloatFilterPredicateLeafBuilder extends FilterPredicateLeafBuilder { +@Override +public FilterPredicate buildPredict(Operator op, Object constant, String columnName) { + switch (op) { + case LESS_THAN: +return lt(floatColumn(columnName), ((Number) constant).floatValue()); + case IS_NULL: + case EQUALS: + case NULL_SAFE_EQUALS: +return eq(floatColumn(columnName), +(constant == null) ? null : ((Number) constant).floatValue()); + case LESS_THAN_EQUALS: +return ltEq(FilterApi.floatColumn(columnName), ((Number) constant).floatValue()); + default: +throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op); + } +} + } + + class DoubleFilterPredicateLeafBuilder extends FilterPredicateLeafBuilder { + +@Override +public FilterPredicate buildPredict(Operator op, Object constant, +