more updates

Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/1752e172
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/1752e172
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/1752e172

Branch: refs/heads/orc-72
Commit: 1752e172a0f7b16fd193f75eb57d10ca05ab0b91
Parents: 5b37113
Author: Owen O'Malley <omal...@apache.org>
Authored: Wed Oct 5 16:04:16 2016 -0700
Committer: Owen O'Malley <omal...@apache.org>
Committed: Mon Oct 10 13:59:16 2016 -0700

----------------------------------------------------------------------
 java/bench/pom.xml                              |  13 +
 .../orc/bench/ColumnProjectionBenchmark.java    |  10 +-
 .../org/apache/orc/bench/CompressionKind.java   |  56 ++++
 .../java/org/apache/orc/bench/GithubToAvro.java |   4 +-
 .../java/org/apache/orc/bench/GithubToJson.java |  21 +-
 .../java/org/apache/orc/bench/GithubToOrc.java  |   6 +-
 .../org/apache/orc/bench/GithubToParquet.java   |   2 +-
 .../java/org/apache/orc/bench/SalesToJson.java  |  16 +-
 .../java/org/apache/orc/bench/SalesToOrc.java   |   2 +-
 .../java/org/apache/orc/bench/TaxiToJson.java   |  64 +---
 .../java/org/apache/orc/bench/TaxiToOrc.java    |  63 +---
 .../java/org/apache/orc/bench/Utilities.java    |  86 ++++++
 .../org/apache/orc/bench/avro/AvroWriter.java   |  23 --
 .../org/apache/orc/bench/json/JsonWriter.java   |  69 +++++
 .../orc/bench/parquet/ConverterParent.java      |  24 ++
 .../bench/parquet/DataWritableReadSupport.java  | 200 +++++++++++++
 .../parquet/DataWritableRecordConverter.java    |  49 ++++
 .../bench/parquet/DataWritableWriteSupport.java |  61 ++++
 .../orc/bench/parquet/ETypeConverter.java       | 292 +++++++++++++++++++
 .../parquet/FilterPredicateLeafBuilder.java     |  80 +++++
 .../bench/parquet/HiveCollectionConverter.java  | 196 +++++++++++++
 .../orc/bench/parquet/HiveGroupConverter.java   |  79 +++++
 .../orc/bench/parquet/HiveSchemaConverter.java  | 140 +++++++++
 .../orc/bench/parquet/HiveStructConverter.java  | 192 ++++++++++++
 .../orc/bench/parquet/LeafFilterFactory.java    | 200 +++++++++++++
 .../parquet/MapredParquetOutputFormat.java      | 129 ++++++++
 .../org/apache/orc/bench/parquet/NanoTime.java  |  68 +++++
 .../apache/orc/bench/parquet/NanoTimeUtils.java | 113 +++++++
 .../ParquetFilterPredicateConverter.java        | 143 +++++++++
 .../parquet/ParquetRecordReaderWrapper.java     | 276 ++++++++++++++++++
 .../org/apache/orc/bench/parquet/Repeated.java  | 193 ++++++++++++
 java/pom.xml                                    |   5 +
 .../java/org/apache/orc/tools/PrintData.java    |   8 +-
 33 files changed, 2699 insertions(+), 184 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/pom.xml
----------------------------------------------------------------------
diff --git a/java/bench/pom.xml b/java/bench/pom.xml
index f40f21b..caee888 100644
--- a/java/bench/pom.xml
+++ b/java/bench/pom.xml
@@ -37,6 +37,10 @@
     </dependency>
     <dependency>
       <groupId>org.apache.orc</groupId>
+      <artifactId>orc-mapreduce</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
       <artifactId>orc-tools</artifactId>
     </dependency>
 
@@ -71,6 +75,10 @@
       <artifactId>parquet-hadoop</artifactId>
     </dependency>
     <dependency>
+      <groupId>org.jodd</groupId>
+      <artifactId>jodd-core</artifactId>
+    </dependency>
+    <dependency>
       <groupId>org.openjdk.jmh</groupId>
       <artifactId>jmh-core</artifactId>
     </dependency>
@@ -89,6 +97,11 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-mapreduce</artifactId>
+      <version>1.3.0-SNAPSHOT</version>
+    </dependency>
   </dependencies>
 
   <build>

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java 
b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
index 4b17819..c53911f 100644
--- a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
+++ b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
@@ -29,8 +29,6 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.TrackingLocalFileSystem;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
-import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapred.FileSplit;
@@ -40,6 +38,8 @@ import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.parquet.DataWritableReadSupport;
+import org.apache.orc.bench.parquet.ParquetRecordReaderWrapper;
 import org.apache.parquet.hadoop.ParquetInputFormat;
 import org.openjdk.jmh.annotations.AuxCounters;
 import org.openjdk.jmh.annotations.Benchmark;
@@ -58,12 +58,9 @@ import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.net.URI;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
-import java.util.zip.GZIPInputStream;
 
 @BenchmarkMode(Mode.AverageTime)
 @Warmup(iterations=1, time=10, timeUnit = TimeUnit.SECONDS)
@@ -171,8 +168,7 @@ public class ColumnProjectionBenchmark {
     NullWritable nada = NullWritable.get();
     FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{});
     org.apache.hadoop.mapred.RecordReader<NullWritable,ArrayWritable> 
recordReader =
-        new ParquetRecordReaderWrapper(inputFormat, split, conf,
-            Reporter.NULL);
+        new ParquetRecordReaderWrapper(inputFormat, split, conf);
     ArrayWritable value = recordReader.createValue();
     while (recordReader.next(nada, value)) {
       counters.records += 1;

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/CompressionKind.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/CompressionKind.java 
b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java
new file mode 100644
index 0000000..9fe9ba9
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import io.airlift.compress.snappy.SnappyCodec;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Created by owen on 10/5/16.
+ */
+public enum CompressionKind {
+  NONE(""),
+  ZLIB(".gz"),
+  SNAPPY(".snappy");
+
+  CompressionKind(String extendsion) {
+    this.extension = extendsion;
+  }
+
+  private final String extension;
+
+  public String getExtension() {
+    return extension;
+  }
+
+  public OutputStream create(OutputStream out) throws IOException {
+    switch (this) {
+      case NONE:
+        return out;
+      case ZLIB:
+        return new GZIPOutputStream(out);
+      case SNAPPY:
+        return new SnappyCodec().createOutputStream(out);
+      default:
+        throw new IllegalArgumentException("Unhandled kind " + this);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java 
b/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java
index ee882e7..eb94ff2 100644
--- a/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java
+++ b/java/bench/src/java/org/apache/orc/bench/GithubToAvro.java
@@ -28,12 +28,12 @@ import org.apache.orc.bench.json.JsonReader;
 public class GithubToAvro {
 
   public static void main(String[] args) throws Exception {
-    TypeDescription schema = TaxiToOrc.loadSchema("github.schema");
+    TypeDescription schema = Utilities.loadSchema("github.schema");
     Configuration conf = new Configuration();
     AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf,
         TaxiToAvro.getCodec(args[1]));
     VectorizedRowBatch batch = schema.createRowBatch();
-    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
+    for(String inFile: Utilities.sliceArray(args, 2)) {
       JsonReader reader = new JsonReader(new Path(inFile), conf, schema);
       while (reader.nextBatch(batch)) {
         writer.writeBatch(batch);

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/GithubToJson.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/GithubToJson.java 
b/java/bench/src/java/org/apache/orc/bench/GithubToJson.java
index 1dd23de..cf6ca33 100644
--- a/java/bench/src/java/org/apache/orc/bench/GithubToJson.java
+++ b/java/bench/src/java/org/apache/orc/bench/GithubToJson.java
@@ -23,29 +23,22 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.bench.json.JsonReader;
-import org.apache.orc.tools.FileDump;
-
-import java.io.OutputStreamWriter;
-import java.io.Writer;
+import org.apache.orc.bench.json.JsonWriter;
 
 public class GithubToJson {
 
   public static void main(String[] args) throws Exception {
-    TypeDescription schema = TaxiToOrc.loadSchema("github.schema");
+    TypeDescription schema = Utilities.loadSchema("github.schema");
     Path path = new Path(args[0]);
     VectorizedRowBatch batch = schema.createRowBatch();
     Configuration conf = new Configuration();
-    Writer output = new OutputStreamWriter(TaxiToJson.getCodec(args[1])
-        .create(path.getFileSystem(conf).create(path)));
-    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
-      JsonReader reader = new JsonReader(new Path(inFile), conf, schema);
-      while (reader.nextBatch(batch)) {
-        for(int r=0; r < batch.size; ++r) {
-          FileDump.printRow(output, batch, schema, r);
-          output.write("\n");
+    try (JsonWriter writer = new JsonWriter(path, schema, conf, args[1])) {
+      for (String inFile : Utilities.sliceArray(args, 2)) {
+        JsonReader reader = new JsonReader(new Path(inFile), conf, schema);
+        while (reader.nextBatch(batch)) {
+          writer.writeBatch(batch);
         }
       }
     }
-    output.close();
   }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java 
b/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java
index ebd6443..a04b08e 100644
--- a/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java
+++ b/java/bench/src/java/org/apache/orc/bench/GithubToOrc.java
@@ -29,13 +29,13 @@ import org.apache.orc.bench.json.JsonReader;
 public class GithubToOrc {
 
   public static void main(String[] args) throws Exception {
-    TypeDescription schema = TaxiToOrc.loadSchema("github.schema");
+    TypeDescription schema = Utilities.loadSchema("github.schema");
     VectorizedRowBatch batch = schema.createRowBatch();
     Configuration conf = new Configuration();
     Writer writer = OrcFile.createWriter(new Path(args[0]),
         OrcFile.writerOptions(conf).setSchema(schema)
-            .compress(TaxiToOrc.getCodec(args[1])));
-    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
+            .compress(Utilities.getCodec(args[1])));
+    for(String inFile: Utilities.sliceArray(args, 2)) {
       JsonReader reader = new JsonReader(new Path(inFile), conf, schema);
       while (reader.nextBatch(batch)) {
         writer.addRowBatch(batch);

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/GithubToParquet.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/GithubToParquet.java 
b/java/bench/src/java/org/apache/orc/bench/GithubToParquet.java
index db88c52..b1678aa 100644
--- a/java/bench/src/java/org/apache/orc/bench/GithubToParquet.java
+++ b/java/bench/src/java/org/apache/orc/bench/GithubToParquet.java
@@ -36,7 +36,7 @@ import java.util.Properties;
 public class GithubToParquet {
 
   public static void main(String[] args) throws Exception {
-    TypeDescription schema = TaxiToOrc.loadSchema("github.schema");
+    TypeDescription schema = Utilities.loadSchema("github.schema");
     VectorizedRowBatch batch = schema.createRowBatch();
     JobConf conf = new JobConf();
     conf.set("mapred.task.id", "attempt_0_0_m_0_0");

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToJson.java 
b/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
index 500b6c9..3d51cc0 100644
--- a/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
+++ b/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
@@ -22,11 +22,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.tools.FileDump;
-
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.zip.GZIPOutputStream;
+import org.apache.orc.bench.json.JsonWriter;
 
 public class SalesToJson {
 
@@ -36,14 +32,10 @@ public class SalesToJson {
     Path path = new Path(args[0]);
     VectorizedRowBatch batch = schema.createRowBatch();
     Configuration conf = new Configuration();
-    Writer output = new OutputStreamWriter(TaxiToJson.getCodec(args[1])
-        .create(path.getFileSystem(conf).create(path)));
-    while (sales.nextBatch(batch)) {
-      for(int r=0; r < batch.size; ++r) {
-        FileDump.printRow(output, batch, schema, r);
-        output.write("\n");
+    try (JsonWriter output = new JsonWriter(path, schema, conf, args[1])) {
+      while (sales.nextBatch(batch)) {
+        output.writeBatch(batch);
       }
     }
-    output.close();
   }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java 
b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
index 4e715b2..d3b2615 100644
--- a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
+++ b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
@@ -33,7 +33,7 @@ public class SalesToOrc {
     Writer writer = OrcFile.createWriter(new Path(args[0]),
         OrcFile.writerOptions(conf)
             .setSchema(sales.getSchema())
-            .compress(TaxiToOrc.getCodec(args[1])));
+            .compress(Utilities.getCodec(args[1])));
     while (sales.nextBatch(batch)) {
       writer.addRowBatch(batch);
     }

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java 
b/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
index 4b8ca8c..8963230 100644
--- a/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
+++ b/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
@@ -23,72 +23,22 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.bench.csv.CsvReader;
-import org.apache.orc.tools.FileDump;
-import org.iq80.snappy.SnappyOutputStream;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.zip.GZIPOutputStream;
+import org.apache.orc.bench.json.JsonWriter;
 
 public class TaxiToJson {
 
-  public enum CompressionKind {
-    NONE(""),
-    ZLIB(".gz"),
-    SNAPPY(".snappy");
-
-    CompressionKind(String extendsion) {
-      this.extension = extendsion;
-    }
-
-    private final String extension;
-
-    public String getExtension() {
-      return extension;
-    }
-
-    public OutputStream create(OutputStream out) throws IOException {
-      switch (this) {
-        case NONE:
-          return out;
-        case ZLIB:
-          return new GZIPOutputStream(out);
-        case SNAPPY:
-          return new SnappyOutputStream(out);
-        default:
-          throw new IllegalArgumentException("Unhandled kind " + this);
-      }
-    }
-  }
-
-  public static CompressionKind getCodec(String name) {
-    if ("none".equals(name)) {
-      return CompressionKind.NONE;
-    } else if ("zlib".equals(name)) {
-      return CompressionKind.ZLIB;
-    } else if ("snappy".equals(name)) {
-      return CompressionKind.SNAPPY;
-    } throw new IllegalArgumentException("Unhnadled kind " + name);
-  }
-
   public static void main(String[] args) throws Exception {
-    TypeDescription schema = TaxiToOrc.loadSchema("nyc-taxi.schema");
+    TypeDescription schema = Utilities.loadSchema("nyc-taxi.schema");
     Path path = new Path(args[0]);
     VectorizedRowBatch batch = schema.createRowBatch();
     Configuration conf = new Configuration();
-    Writer output = new OutputStreamWriter(getCodec(args[1])
-        .create(path.getFileSystem(conf).create(path)));
-    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
-      CsvReader reader = new CsvReader(new Path(inFile), conf, schema);
-      while (reader.nextBatch(batch)) {
-        for(int r=0; r < batch.size; ++r) {
-          FileDump.printRow(output, batch, schema, r);
-          output.write("\n");
+    try (JsonWriter output = new JsonWriter(path, schema, conf, args[1])) {
+      for(String inFile: Utilities.sliceArray(args, 2)) {
+        CsvReader taxi = new CsvReader(new Path(inFile), conf, schema);
+        while (taxi.nextBatch(batch)) {
+          output.writeBatch(batch);
         }
       }
     }
-    output.close();
   }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java 
b/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
index 2588c72..e95f794 100644
--- a/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
+++ b/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
@@ -33,72 +33,15 @@ import java.util.Iterator;
 
 public class TaxiToOrc {
 
-  public static TypeDescription loadSchema(String name) throws IOException {
-    InputStream in = 
TaxiToOrc.class.getClassLoader().getResourceAsStream(name);
-    byte[] buffer= new byte[1 * 1024];
-    int len = in.read(buffer);
-    StringBuilder string = new StringBuilder();
-    while (len > 0) {
-      for(int i=0; i < len; ++i) {
-        // strip out
-        if (buffer[i] != '\n' && buffer[i] != ' ') {
-          string.append((char) buffer[i]);
-        }
-      }
-      len = in.read(buffer);
-    }
-    return TypeDescription.fromString(string.toString());
-  }
-
-  public static CompressionKind getCodec(String compression) {
-    if ("none".equals(compression)) {
-      return CompressionKind.NONE;
-    } else if ("zlib".equals(compression)) {
-      return CompressionKind.ZLIB;
-    } else if ("snappy".equals(compression)) {
-      return CompressionKind.SNAPPY;
-    } else {
-      throw new IllegalArgumentException("Unknown compression " + compression);
-    }
-  }
-
-  public static Iterable<String> sliceArray(final String[] array,
-                                            final int start) {
-    return new Iterable<String>() {
-      String[] values = array;
-      int posn = start;
-
-      @Override
-      public Iterator<String> iterator() {
-        return new Iterator<String>() {
-          @Override
-          public boolean hasNext() {
-            return posn < values.length;
-          }
-
-          @Override
-          public String next() {
-            return values[posn++];
-          }
-
-          @Override
-          public void remove() {
-            throw new UnsupportedOperationException("No remove");
-          }
-        };
-      }
-    };
-  }
-
   public static void main(String[] args) throws Exception {
-    TypeDescription schema = loadSchema("nyc-taxi.schema");
+    TypeDescription schema = Utilities.loadSchema("nyc-taxi.schema");
     VectorizedRowBatch batch = schema.createRowBatch();
     Configuration conf = new Configuration();
     Writer writer = OrcFile.createWriter(new Path(args[0]),
         OrcFile.writerOptions(conf)
             .setSchema(schema)
-            .compress(getCodec(args[1])));
-    for(String inFile: sliceArray(args, 2)) {
+            .compress(Utilities.getCodec(args[1])));
+    for(String inFile: Utilities.sliceArray(args, 2)) {
       CsvReader reader = new CsvReader(new Path(inFile), conf, schema);
       while (reader.nextBatch(batch)) {
         writer.addRowBatch(batch);

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/Utilities.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/Utilities.java 
b/java/bench/src/java/org/apache/orc/bench/Utilities.java
new file mode 100644
index 0000000..9a95ae9
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/Utilities.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.orc.CompressionKind;
+import org.apache.orc.TypeDescription;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+
+public class Utilities {
+
+  public static TypeDescription loadSchema(String name) throws IOException {
+    InputStream in = 
Utilities.class.getClassLoader().getResourceAsStream(name);
+    byte[] buffer= new byte[1 * 1024];
+    int len = in.read(buffer);
+    StringBuilder string = new StringBuilder();
+    while (len > 0) {
+      for(int i=0; i < len; ++i) {
+        // strip out newlines and spaces
+        if (buffer[i] != '\n' && buffer[i] != ' ') {
+          string.append((char) buffer[i]);
+        }
+      }
+      len = in.read(buffer);
+    }
+    return TypeDescription.fromString(string.toString());
+  }
+
+  public static CompressionKind getCodec(String compression) {
+    if ("none".equals(compression)) {
+      return CompressionKind.NONE;
+    } else if ("zlib".equals(compression)) {
+      return CompressionKind.ZLIB;
+    } else if ("snappy".equals(compression)) {
+      return CompressionKind.SNAPPY;
+    } else {
+      throw new IllegalArgumentException("Unknown compression " + compression);
+    }
+  }
+
+  public static Iterable<String> sliceArray(final String[] array,
+                                            final int start) {
+    return new Iterable<String>() {
+      String[] values = array;
+      int posn = start;
+
+      @Override
+      public Iterator<String> iterator() {
+        return new Iterator<String>() {
+          @Override
+          public boolean hasNext() {
+            return posn < values.length;
+          }
+
+          @Override
+          public String next() {
+            return values[posn++];
+          }
+
+          @Override
+          public void remove() {
+            throw new UnsupportedOperationException("No remove");
+          }
+        };
+      }
+    };
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java 
b/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java
index f9d3bad..2735a71 100644
--- a/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java
+++ b/java/bench/src/java/org/apache/orc/bench/avro/AvroWriter.java
@@ -46,29 +46,6 @@ import java.util.Properties;
 
 public class AvroWriter {
 
-  static Properties setHiveSchema(TypeDescription schema) {
-    if (schema.getCategory() != TypeDescription.Category.STRUCT) {
-      throw new IllegalArgumentException("Assumes struct type as root, not " +
-          schema);
-    }
-    StringBuilder fieldNames = new StringBuilder();
-    StringBuilder fieldTypes = new StringBuilder();
-    List<String> childNames = schema.getFieldNames();
-    List<TypeDescription> childTypes = schema.getChildren();
-    for(int f=0; f < childNames.size(); ++f) {
-      if (f != 0) {
-        fieldNames.append(',');
-        fieldTypes.append(',');
-      }
-      fieldNames.append(childNames.get(f));
-      fieldTypes.append(childTypes.get(f).toString());
-    }
-    Properties properties = new Properties();
-    properties.put("columns", fieldNames.toString());
-    properties.put("columns.types", fieldTypes.toString());
-    return properties;
-  }
-
   interface AvroConverter {
     Object convert(ColumnVector vector, int row);
   }

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/json/JsonWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/json/JsonWriter.java 
b/java/bench/src/java/org/apache/orc/bench/json/JsonWriter.java
new file mode 100644
index 0000000..9f03197
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/json/JsonWriter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.json;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.tools.PrintData;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONWriter;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+public class JsonWriter implements Closeable {
+  private final Writer outStream;
+  private final JSONWriter writer;
+  private final TypeDescription schema;
+
+  public JsonWriter(Path path, TypeDescription schema,
+                    Configuration conf,
+                    String compression) throws IOException {
+    CompressionKind codec = CompressionKind.valueOf(compression);
+    path = path.suffix(".jsn" + codec.getExtension());
+    OutputStream file = path.getFileSystem(conf).create(path, true);
+    outStream = new OutputStreamWriter(codec.create(file));
+    writer = new JSONWriter(outStream);
+    this.schema = schema;
+  }
+
+  public void writeBatch(VectorizedRowBatch batch) throws IOException {
+    try {
+      for (int r = 0; r < batch.size; ++r) {
+        for (int f = 0; f < batch.cols.length; ++f) {
+          PrintData.printRow(writer, batch, schema, r);
+
+        }
+        outStream.append('\n');
+      }
+    } catch (JSONException je) {
+      throw new IOException("json problem", je);
+    }
+  }
+
+  public void close() throws IOException {
+    outStream.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/ConverterParent.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/ConverterParent.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/ConverterParent.java
new file mode 100644
index 0000000..afd6b76
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/parquet/ConverterParent.java
@@ -0,0 +1,24 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import org.apache.hadoop.io.Writable;
+
+import java.util.Map;
+
+interface ConverterParent {
+  void set(int index, Writable value);
+
+  Map<String, String> getMetadata();
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableReadSupport.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableReadSupport.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableReadSupport.java
new file mode 100644
index 0000000..0bcce1f
--- /dev/null
+++ 
b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableReadSupport.java
@@ -0,0 +1,200 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.hadoop.api.InitContext;
+import org.apache.parquet.hadoop.api.ReadSupport;
+import org.apache.parquet.io.api.RecordMaterializer;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.schema.Type.Repetition;
+import org.apache.parquet.schema.Types;
+
+/**
+ * Parquet {@link ReadSupport} that materializes records as {@link ArrayWritable}.
+ *
+ * <p>During {@link #init(InitContext)} the optional reader schema stored under the
+ * {@code reader.schema} configuration key is parsed into a {@link TypeDescription}
+ * and used to project the file schema by column name (ignoring case).
+ */
+public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {
+
+  public static final String HIVE_TABLE_AS_PARQUET_SCHEMA = "HIVE_TABLE_SCHEMA";
+  public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
+  // Reader schema parsed in init(); stays null when no reader schema is configured.
+  private TypeDescription hiveTypeInfo;
+
+  /**
+   * Searches a Parquet GroupType for a field, ignoring the case of its name.
+   * GroupType#getType(String fieldName) is case sensitive, so we use this method.
+   *
+   * @param groupType Group of field types in which to search for fieldName
+   * @param fieldName The field we are searching for
+   * @return The Type object of the field found; null otherwise.
+   */
+  private static Type getFieldTypeIgnoreCase(GroupType groupType, String fieldName) {
+    for (Type type : groupType.getFields()) {
+      if (type.getName().equalsIgnoreCase(fieldName)) {
+        return type;
+      }
+    }
+
+    return null;
+  }
+
+  /**
+   * Looks up each field of the reader schema by name in a Parquet group and returns
+   * the corresponding projected Parquet types.
+   *
+   * @param schema Group schema in which to search for column names.
+   * @param colTypes Struct type whose field names and child types drive the projection.
+   * @return List of projected types, one per field of {@code colTypes}; a field that is
+   *         missing from {@code schema} is represented as an optional BINARY column.
+   */
+  private static List<Type> getProjectedGroupFields(GroupType schema,
+                                                    TypeDescription colTypes) {
+    List<Type> schemaTypes = new ArrayList<Type>();
+    List<String> fieldNames = colTypes.getFieldNames();
+    List<TypeDescription> fieldTypes = colTypes.getChildren();
+
+    for (int i = 0; i < fieldNames.size(); ++i) {
+      TypeDescription colType = fieldTypes.get(i);
+      String colName = fieldNames.get(i);
+
+      Type fieldType = getFieldTypeIgnoreCase(schema, colName);
+      if (fieldType == null) {
+        schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named(colName));
+      } else {
+        schemaTypes.add(getProjectedType(colType, fieldType));
+      }
+    }
+
+    return schemaTypes;
+  }
+
+  /**
+   * Projects a single Parquet field according to the reader type.
+   *
+   * <p>Structs are projected field by field. Lists whose element is a struct have the
+   * element projected as well. Every other type is returned unchanged.
+   *
+   * @param colType reader type of the column
+   * @param fieldType Parquet type of the column found in the file
+   * @return the projected Parquet type
+   */
+  private static Type getProjectedType(TypeDescription colType, Type fieldType) {
+    switch (colType.getCategory()) {
+      case STRUCT:
+        List<Type> groupFields = getProjectedGroupFields(
+          fieldType.asGroupType(),
+          colType
+        );
+
+        Type[] typesArray = groupFields.toArray(new Type[0]);
+        return Types.buildGroup(fieldType.getRepetition())
+          .addFields(typesArray)
+          .named(fieldType.getName());
+      case LIST:
+        TypeDescription elemType = colType.getChildren().get(0);
+        if (elemType.getCategory() == TypeDescription.Category.STRUCT) {
+          Type subFieldType = fieldType.asGroupType().getType(0);
+          if (!subFieldType.isPrimitive()) {
+            String subFieldName = subFieldType.getName();
+            // Compare the String itself: wrapping it in a Text and calling
+            // Text.equals(String) is never true, which left this branch unreachable.
+            if ("array".equals(subFieldName) || "list".equals(subFieldName)) {
+              // The repeated group is only a wrapper; project the element inside it.
+              subFieldType = new GroupType(Repetition.REPEATED, subFieldName,
+                getProjectedType(elemType, subFieldType.asGroupType().getType(0)));
+            } else {
+              subFieldType = getProjectedType(elemType, subFieldType);
+            }
+            return Types.buildGroup(Repetition.OPTIONAL).as(OriginalType.LIST).addFields(
+              subFieldType).named(fieldType.getName());
+          }
+        }
+        break;
+      default:
+        break;
+    }
+    return fieldType;
+  }
+
+  /**
+   * Looks up the reader schema's columns by name in a Parquet message schema and
+   * returns the projected Parquet message type.
+   *
+   * @param schema Message type schema in which to search for column names.
+   * @param colTypes The reader schema
+   * @return A MessageType object of projected columns.
+   */
+  private static MessageType getSchemaByName(MessageType schema,
+                                             TypeDescription colTypes) {
+    List<Type> projectedFields = getProjectedGroupFields(schema, colTypes);
+    Type[] typesArray = projectedFields.toArray(new Type[0]);
+
+    return Types.buildMessage()
+        .addFields(typesArray)
+        .named(schema.getName());
+  }
+
+  /**
+   * Creates the Parquet read context with the requested schema during the init phase.
+   *
+   * <p>When {@code reader.schema} is set, the file schema is projected onto it and the
+   * parsed reader schema is remembered for {@code prepareForRead}. Otherwise the file
+   * schema is requested as is.
+   *
+   * @param context supplies the configuration and the file schema
+   * @return the parquet ReadContext
+   */
+  @Override
+  public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
+    Configuration configuration = context.getConfiguration();
+    MessageType fileSchema = context.getFileSchema();
+    Map<String, String> contextMetadata = new HashMap<String, String>();
+    String readerSchemaString = configuration.get("reader.schema");
+
+    if (readerSchemaString != null) {
+      this.hiveTypeInfo = TypeDescription.fromString(readerSchemaString);
+
+      MessageType tableSchema = getSchemaByName(fileSchema, hiveTypeInfo);
+
+      contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, tableSchema.toString());
+      contextMetadata.put(PARQUET_COLUMN_INDEX_ACCESS, String.valueOf(false));
+
+      return new ReadContext(tableSchema, contextMetadata);
+    } else {
+      contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, fileSchema.toString());
+      return new ReadContext(fileSchema, contextMetadata);
+    }
+  }
+
+  /**
+   * Creates the record materializer that converts Parquet records to
+   * {@link ArrayWritable}.
+   *
+   * @param configuration // unused
+   * @param keyValueMetaData // unused
+   * @param fileSchema // unused
+   * @param readContext containing the requested schema and the read-support metadata
+   * @return the record materializer
+   * @throws IllegalStateException if the read context carries no metadata
+   */
+  @Override
+  public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
+      final Map<String, String> keyValueMetaData, final MessageType fileSchema,
+          final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
+    final Map<String, String> metadata = readContext.getReadSupportMetadata();
+    if (metadata == null) {
+      throw new IllegalStateException("ReadContext not initialized properly. " +
+        "Don't know the Hive Schema.");
+    }
+    return new DataWritableRecordConverter(readContext.getRequestedSchema(), metadata,
+        hiveTypeInfo);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableRecordConverter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableRecordConverter.java
 
b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableRecordConverter.java
new file mode 100644
index 0000000..9b1f4e5
--- /dev/null
+++ 
b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableRecordConverter.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.io.api.GroupConverter;
+import org.apache.parquet.io.api.RecordMaterializer;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageTypeParser;
+
+import java.util.Map;
+
+/**
+ *
+ * A MapWritableReadSupport, encapsulates the tuples
+ *
+ */
+public class DataWritableRecordConverter extends 
RecordMaterializer<ArrayWritable> {
+
+  private final HiveStructConverter root;
+
+  public DataWritableRecordConverter(final GroupType requestedSchema, final 
Map<String, String> metadata, TypeDescription hiveTypeInfo) {
+    this.root = new HiveStructConverter(requestedSchema,
+      
MessageTypeParser.parseMessageType(metadata.get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)),
+        metadata, hiveTypeInfo);
+  }
+
+  @Override
+  public ArrayWritable getCurrentRecord() {
+    return root.getCurrentArray();
+  }
+
+  @Override
+  public GroupConverter getRootConverter() {
+    return root;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java
 
b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java
new file mode 100644
index 0000000..f4621e5
--- /dev/null
+++ 
b/java/bench/src/java/org/apache/orc/bench/parquet/DataWritableWriteSupport.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io.parquet.write;
+
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
+
+import org.apache.parquet.hadoop.api.WriteSupport;
+import org.apache.parquet.io.api.RecordConsumer;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.MessageTypeParser;
+
+/**
+ * {@link WriteSupport} that forwards {@link ParquetHiveRecord}s to a
+ * {@code DataWritableWriter}. The Parquet schema is exchanged through the
+ * {@link #PARQUET_HIVE_SCHEMA} configuration entry.
+ *
+ * <p>NOTE(review): this file lives under {@code org/apache/orc/bench/parquet} but
+ * declares a different package - confirm which one is intended.
+ */
+public class DataWritableWriteSupport extends WriteSupport<ParquetHiveRecord> {
+
+  public static final String PARQUET_HIVE_SCHEMA = "parquet.hive.schema";
+
+  private DataWritableWriter writer;
+  private MessageType schema;
+
+  /**
+   * Stores the textual form of {@code schema} in the configuration.
+   *
+   * @param schema the Parquet schema to publish
+   * @param configuration the configuration to write to
+   */
+  public static void setSchema(final MessageType schema, final Configuration configuration) {
+    final String schemaText = schema.toString();
+    configuration.set(PARQUET_HIVE_SCHEMA, schemaText);
+  }
+
+  /**
+   * Parses the schema previously stored under {@link #PARQUET_HIVE_SCHEMA}.
+   *
+   * @param configuration the configuration to read from
+   * @return the parsed Parquet message type
+   */
+  public static MessageType getSchema(final Configuration configuration) {
+    final String schemaText = configuration.get(PARQUET_HIVE_SCHEMA);
+    return MessageTypeParser.parseMessageType(schemaText);
+  }
+
+  /** Reads the schema from the configuration and returns it with empty extra metadata. */
+  @Override
+  public WriteContext init(final Configuration configuration) {
+    this.schema = getSchema(configuration);
+    final HashMap<String, String> extraMetadata = new HashMap<String, String>();
+    return new WriteContext(this.schema, extraMetadata);
+  }
+
+  /** Creates the writer bound to the given consumer and the schema read in {@code init}. */
+  @Override
+  public void prepareForWrite(final RecordConsumer recordConsumer) {
+    this.writer = new DataWritableWriter(recordConsumer, this.schema);
+  }
+
+  /** Hands the record to the writer created in {@code prepareForWrite}. */
+  @Override
+  public void write(final ParquetHiveRecord record) {
+    this.writer.write(record);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/ETypeConverter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/ETypeConverter.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/ETypeConverter.java
new file mode 100644
index 0000000..56dcfe7
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/parquet/ETypeConverter.java
@@ -0,0 +1,292 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import java.math.BigDecimal;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Map;
+
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.orc.TypeDescription;
+import org.apache.orc.mapred.OrcTimestamp;
+import org.apache.parquet.column.Dictionary;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.io.api.PrimitiveConverter;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+
+/**
+ * Maps a Parquet primitive type to the {@link PrimitiveConverter} that wraps each
+ * decoded value in a Hadoop {@link Writable} and hands it to a {@link ConverterParent}.
+ *
+ * <p>Each constant is keyed by the Java class it handles. Use
+ * {@link #getNewConverter} to select the converter for a given Parquet type.
+ */
+public enum ETypeConverter {
+
+  EDOUBLE_CONVERTER(Double.TYPE) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new PrimitiveConverter() {
+        @Override
+        public void addDouble(final double value) {
+          parent.set(index, new DoubleWritable(value));
+        }
+      };
+    }
+  },
+  EBOOLEAN_CONVERTER(Boolean.TYPE) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new PrimitiveConverter() {
+        @Override
+        public void addBoolean(final boolean value) {
+          parent.set(index, new BooleanWritable(value));
+        }
+      };
+    }
+  },
+  EFLOAT_CONVERTER(Float.TYPE) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      // Widen to double when the reader schema asks for a DOUBLE column.
+      if (hiveTypeInfo != null
+          && hiveTypeInfo.getCategory() == TypeDescription.Category.DOUBLE) {
+        return new PrimitiveConverter() {
+          @Override
+          public void addFloat(final float value) {
+            parent.set(index, new DoubleWritable((double) value));
+          }
+        };
+      } else {
+        return new PrimitiveConverter() {
+          @Override
+          public void addFloat(final float value) {
+            parent.set(index, new FloatWritable(value));
+          }
+        };
+      }
+    }
+  },
+  EINT32_CONVERTER(Integer.TYPE) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      // Promote the int to the type requested by the reader schema, if any.
+      if (hiveTypeInfo != null) {
+        switch (hiveTypeInfo.getCategory()) {
+          case LONG:
+            return new PrimitiveConverter() {
+              @Override
+              public void addInt(final int value) {
+                parent.set(index, new LongWritable((long) value));
+              }
+            };
+          case FLOAT:
+            return new PrimitiveConverter() {
+              @Override
+              public void addInt(final int value) {
+                parent.set(index, new FloatWritable((float) value));
+              }
+            };
+          case DOUBLE:
+            return new PrimitiveConverter() {
+              @Override
+              public void addInt(final int value) {
+                // Convert straight to double: going through float would lose
+                // precision for magnitudes above 2^24.
+                parent.set(index, new DoubleWritable((double) value));
+              }
+            };
+          default:
+            // No promotion requested; fall through to the plain int converter.
+            break;
+        }
+      }
+      return new PrimitiveConverter() {
+        @Override
+        public void addInt(final int value) {
+          parent.set(index, new IntWritable(value));
+        }
+      };
+    }
+  },
+  EINT64_CONVERTER(Long.TYPE) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      // Promote the long to the floating-point type requested by the reader schema.
+      if (hiveTypeInfo != null) {
+        switch (hiveTypeInfo.getCategory()) {
+          case FLOAT:
+            return new PrimitiveConverter() {
+              @Override
+              public void addLong(final long value) {
+                parent.set(index, new FloatWritable(value));
+              }
+            };
+          case DOUBLE:
+            return new PrimitiveConverter() {
+              @Override
+              public void addLong(final long value) {
+                parent.set(index, new DoubleWritable(value));
+              }
+            };
+          default:
+            // No promotion requested; fall through to the plain long converter.
+            break;
+        }
+      }
+      return new PrimitiveConverter() {
+        @Override
+        public void addLong(final long value) {
+          parent.set(index, new LongWritable(value));
+        }
+      };
+    }
+  },
+  EBINARY_CONVERTER(Binary.class) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new BinaryConverter<BytesWritable>(type, parent, index) {
+        @Override
+        protected BytesWritable convert(Binary binary) {
+          return new BytesWritable(binary.getBytes());
+        }
+      };
+    }
+  },
+  ESTRING_CONVERTER(String.class) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new BinaryConverter<Text>(type, parent, index) {
+        @Override
+        protected Text convert(Binary binary) {
+          return new Text(binary.getBytes());
+        }
+      };
+    }
+  },
+  EDECIMAL_CONVERTER(BigDecimal.class) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new BinaryConverter<HiveDecimalWritable>(type, parent, index) {
+        @Override
+        protected HiveDecimalWritable convert(Binary binary) {
+          return new HiveDecimalWritable(binary.getBytes(),
+              type.getDecimalMetadata().getScale());
+        }
+      };
+    }
+  },
+  ETIMESTAMP_CONVERTER(OrcTimestamp.class) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new BinaryConverter<OrcTimestamp>(type, parent, index) {
+        @Override
+        protected OrcTimestamp convert(Binary binary) {
+          NanoTime nt = NanoTime.fromBinary(binary);
+          // Current Hive parquet timestamp implementation stores it in UTC, but other
+          // components do not do that. This reader always skips the reverse conversion.
+          boolean skipConversion = true;
+          Timestamp ts = NanoTimeUtils.getTimestamp(nt, skipConversion);
+          // NOTE(review): only the millisecond value of ts is passed on - confirm that
+          // dropping the sub-millisecond part is intended.
+          return new OrcTimestamp(ts.getTime());
+        }
+      };
+    }
+  },
+  EDATE_CONVERTER(DateWritable.class) {
+    @Override
+    PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+        final ConverterParent parent, TypeDescription hiveTypeInfo) {
+      return new PrimitiveConverter() {
+        @Override
+        public void addInt(final int value) {
+          parent.set(index, new DateWritable(value));
+        }
+      };
+    }
+  };
+
+  // Java class this constant is keyed by; matched against the Parquet javaType.
+  final Class<?> _type;
+
+  private ETypeConverter(final Class<?> type) {
+    this._type = type;
+  }
+
+  private Class<?> getType() {
+    return _type;
+  }
+
+  /**
+   * Creates the converter for one column.
+   *
+   * @param type Parquet type of the column
+   * @param index position at which values are delivered to {@code parent}
+   * @param parent receiver of the converted values
+   * @param hiveTypeInfo reader type of the column; may be null
+   * @return a new converter
+   */
+  abstract PrimitiveConverter getConverter(final PrimitiveType type, final int index,
+      final ConverterParent parent, TypeDescription hiveTypeInfo);
+
+  /**
+   * Selects and creates the converter matching a Parquet primitive type.
+   *
+   * <p>INT96 is treated as a timestamp. The DECIMAL, UTF8 and DATE annotations are
+   * checked next. Otherwise the constant keyed by the primitive's Java type is used.
+   *
+   * @param type Parquet type of the column
+   * @param index position at which values are delivered to {@code parent}
+   * @param parent receiver of the converted values
+   * @param hiveTypeInfo reader type of the column; may be null
+   * @return a new converter
+   * @throws IllegalArgumentException if no constant handles the type
+   */
+  public static PrimitiveConverter getNewConverter(final PrimitiveType type, final int index,
+                                                   final ConverterParent parent,
+                                                   TypeDescription hiveTypeInfo) {
+    if (type.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) {
+      //TODO- cleanup once parquet support Timestamp type annotation.
+      return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent,
+          hiveTypeInfo);
+    }
+    if (OriginalType.DECIMAL == type.getOriginalType()) {
+      return EDECIMAL_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
+    } else if (OriginalType.UTF8 == type.getOriginalType()) {
+      return ESTRING_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
+    } else if (OriginalType.DATE == type.getOriginalType()) {
+      return EDATE_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
+    }
+
+    Class<?> javaType = type.getPrimitiveTypeName().javaType;
+    for (final ETypeConverter eConverter : values()) {
+      if (eConverter.getType() == javaType) {
+        return eConverter.getConverter(type, index, parent, hiveTypeInfo);
+      }
+    }
+
+    throw new IllegalArgumentException("Converter not found ... for type : " + type);
+  }
+
+  /**
+   * Base class for converters of binary-encoded values. It supports dictionary
+   * encoding by converting every dictionary entry once and reusing the results.
+   *
+   * @param <T> the Writable type produced by {@link #convert(Binary)}
+   */
+  public abstract static class BinaryConverter<T extends Writable> extends PrimitiveConverter {
+    protected final PrimitiveType type;
+    private final ConverterParent parent;
+    private final int index;
+    // Converted dictionary entries, indexed by dictionary id; set in setDictionary().
+    private ArrayList<T> lookupTable;
+
+    public BinaryConverter(PrimitiveType type, ConverterParent parent, int index) {
+      this.type = type;
+      this.parent = parent;
+      this.index = index;
+    }
+
+    /** Converts one binary value into its Writable form. */
+    protected abstract T convert(Binary binary);
+
+    @Override
+    public boolean hasDictionarySupport() {
+      return true;
+    }
+
+    @Override
+    public void setDictionary(Dictionary dictionary) {
+      int length = dictionary.getMaxId() + 1;
+      // The number of entries is known up front, so size the table once.
+      lookupTable = new ArrayList<T>(length);
+      for (int i = 0; i < length; i++) {
+        lookupTable.add(convert(dictionary.decodeToBinary(i)));
+      }
+    }
+
+    @Override
+    public void addValueFromDictionary(int dictionaryId) {
+      parent.set(index, lookupTable.get(dictionaryId));
+    }
+
+    @Override
+    public void addBinary(Binary value) {
+      parent.set(index, convert(value));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/FilterPredicateLeafBuilder.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/FilterPredicateLeafBuilder.java
 
b/java/bench/src/java/org/apache/orc/bench/parquet/FilterPredicateLeafBuilder.java
new file mode 100644
index 0000000..958c439
--- /dev/null
+++ 
b/java/bench/src/java/org/apache/orc/bench/parquet/FilterPredicateLeafBuilder.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.parquet.filter2.predicate.FilterApi;
+import org.apache.parquet.filter2.predicate.FilterPredicate;
+
+import static org.apache.parquet.filter2.predicate.FilterApi.not;
+import static org.apache.parquet.filter2.predicate.FilterApi.or;
+
+/**
+ * Base class for builders that turn a {@link PredicateLeaf} operator and its
+ * literals into a Parquet {@link FilterPredicate}.
+ */
+public abstract class FilterPredicateLeafBuilder {
+
+  /**
+   * Builds a filter predicate for an operator that takes several constants.
+   *
+   * <p>IN becomes an OR of EQUALS predicates, one per literal, and yields null for an
+   * empty list. BETWEEN becomes {@code (col <= max) AND NOT (col < min)} and needs
+   * exactly two literals, minimum first.
+   *
+   * @param op         IN or BETWEEN
+   * @param literals   the constants of the predicate
+   * @param columnName the column the predicate applies to
+   * @return the combined predicate
+   * @throws RuntimeException if BETWEEN is not given two literals or the operator is
+   *         neither IN nor BETWEEN
+   */
+  public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals,
+                                        String columnName) throws Exception {
+    switch (op) {
+      case IN: {
+        FilterPredicate anyMatch = null;
+        for (Object candidate : literals) {
+          FilterPredicate equalsCandidate =
+              buildPredict(PredicateLeaf.Operator.EQUALS, candidate, columnName);
+          anyMatch = (anyMatch == null) ? equalsCandidate : or(anyMatch, equalsCandidate);
+        }
+        return anyMatch;
+      }
+      case BETWEEN: {
+        if (literals.size() != 2) {
+          throw new RuntimeException(
+            "Not able to build 'between' operation filter with " + literals +
+              " which needs two literals");
+        }
+        Object lowerBound = literals.get(0);
+        Object upperBound = literals.get(1);
+        // NOT (col < min), i.e. the lower bound is inclusive.
+        FilterPredicate notBelowMin =
+            not(buildPredict(PredicateLeaf.Operator.LESS_THAN, lowerBound, columnName));
+        FilterPredicate atMostMax =
+            buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, upperBound, columnName);
+        return FilterApi.and(atMostMax, notBelowMin);
+      }
+      default:
+        throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
+    }
+  }
+
+  /**
+   * Builds a predicate for an operator that takes a single constant.
+   *
+   * @param op         EQUALS, NULL_SAFE_EQUALS, LESS_THAN, LESS_THAN_EQUALS, IS_NULL
+   * @param constant   the constant to compare the column with
+   * @param columnName the column the predicate applies to
+   * @return null or a FilterPredicate, null means no filter will be executed
+   */
+  public abstract FilterPredicate buildPredict(PredicateLeaf.Operator op, Object constant,
+                                               String columnName) throws Exception;
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/HiveCollectionConverter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/HiveCollectionConverter.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/HiveCollectionConverter.java
new file mode 100644
index 0000000..a8834aa
--- /dev/null
+++ 
b/java/bench/src/java/org/apache/orc/bench/parquet/HiveCollectionConverter.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.parquet;
+
+import com.google.common.base.Preconditions;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.io.api.Converter;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.Type;
+
+public class HiveCollectionConverter extends HiveGroupConverter {
+  private final GroupType collectionType;
+  private final ConverterParent parent;
+  private final int index;
+  private final Converter innerConverter;
+  private final List<Writable> list = new ArrayList<Writable>();
+
+  public static HiveGroupConverter forMap(GroupType mapType,
+                                          ConverterParent parent,
+                                          int index, TypeDescription 
hiveTypeInfo) {
+    return new HiveCollectionConverter(
+        mapType, parent, index, true /* its a map */, hiveTypeInfo );
+  }
+
+  public static HiveGroupConverter forList(GroupType listType,
+                                           ConverterParent parent,
+                                           int index, TypeDescription 
hiveTypeInfo) {
+    return new HiveCollectionConverter(
+      listType, parent, index, false /* nUnknown hive type infoot a map */, 
hiveTypeInfo);
+  }
+
+  private HiveCollectionConverter(GroupType collectionType,
+                                  ConverterParent parent,
+                                  int index, boolean isMap, TypeDescription 
hiveTypeInfo) {
+    setMetadata(parent.getMetadata());
+    this.collectionType = collectionType;
+    this.parent = parent;
+    this.index = index;
+    Type repeatedType = collectionType.getType(0);
+    if (isMap) {
+      this.innerConverter = new KeyValueConverter(
+          repeatedType.asGroupType(), this, hiveTypeInfo);
+    } else if (isElementType(repeatedType, collectionType.getName())) {
+      this.innerConverter = getConverterFromDescription(repeatedType, 0, this, 
extractListCompatibleType(hiveTypeInfo));
+    } else {
+      this.innerConverter = new ElementConverter(
+          repeatedType.asGroupType(), this,  
extractListCompatibleType(hiveTypeInfo));
+    }
+  }
+
+  private TypeDescription extractListCompatibleType(TypeDescription 
hiveTypeInfo) {
+    if (hiveTypeInfo !=  null && hiveTypeInfo.getCategory() == 
TypeDescription.Category.LIST) {
+      return hiveTypeInfo.getChildren().get(0);
+    } else {
+      return hiveTypeInfo; //to handle map can read list of struct data (i.e. 
list<struct<key, value>> --> map<key,
+      // value>)
+    }
+  }
+
+  @Override
+  public Converter getConverter(int fieldIndex) {
+    Preconditions.checkArgument(
+        fieldIndex == 0, "Invalid field index: " + fieldIndex);
+    return innerConverter;
+  }
+
+  @Override
+  public void start() {
+    list.clear();
+  }
+
+  @Override
+  public void end() {
+    parent.set(index, new ArrayWritable(
+        Writable.class, list.toArray(new Writable[0])));
+  }
+
+  @Override
+  public void set(int index, Writable value) {
+    list.add(value);
+  }
+
+  private static class KeyValueConverter extends HiveGroupConverter {
+    private final HiveGroupConverter parent;
+    private final Converter keyConverter;
+    private final Converter valueConverter;
+    private Writable[] keyValue = null;
+
+    public KeyValueConverter(GroupType keyValueType, HiveGroupConverter 
parent, TypeDescription hiveTypeInfo) {
+      setMetadata(parent.getMetadata());
+      this.parent = parent;
+      this.keyConverter = getConverterFromDescription(
+          keyValueType.getType(0), 0, this, hiveTypeInfo == null ? null : 
hiveTypeInfo.getChildren().get(0));
+      this.valueConverter = getConverterFromDescription(
+          keyValueType.getType(1), 1, this, hiveTypeInfo == null ? null : 
hiveTypeInfo.getChildren().get(1));
+    }
+
+    @Override
+    public void set(int fieldIndex, Writable value) {
+      keyValue[fieldIndex] = value;
+    }
+
+    @Override
+    public Converter getConverter(int fieldIndex) {
+      switch (fieldIndex) {
+        case 0:
+          return keyConverter;
+        case 1:
+          return valueConverter;
+        default:
+          throw new IllegalArgumentException(
+              "Invalid field index for map key-value: " + fieldIndex);
+      }
+    }
+
+    @Override
+    public void start() {
+      this.keyValue = new Writable[2];
+    }
+
+    @Override
+    public void end() {
+      parent.set(0, new ArrayWritable(Writable.class, keyValue));
+    }
+  }
+
+  private static class ElementConverter extends HiveGroupConverter {
+    private final HiveGroupConverter parent;
+    private final Converter elementConverter;
+    private Writable element = null;
+
+    public ElementConverter(GroupType repeatedType, HiveGroupConverter parent, 
TypeDescription hiveTypeInfo) {
+      setMetadata(parent.getMetadata());
+      this.parent = parent;
+      this.elementConverter = getConverterFromDescription(
+          repeatedType.getType(0), 0, this, hiveTypeInfo);
+    }
+
+    @Override
+    public void set(int index, Writable value) {
+      this.element = value;
+    }
+
+    @Override
+    public Converter getConverter(int i) {
+      return elementConverter;
+    }
+
+    @Override
+    public void start() {
+      this.element = null;
+    }
+
+    @Override
+    public void end() {
+      parent.set(0, element);
+    }
+  }
+
+  private static boolean isElementType(Type repeatedType, String parentName) {
+    if (repeatedType.isPrimitive() ||
+        (repeatedType.asGroupType().getFieldCount() != 1)) {
+      return true;
+    } else if (repeatedType.getName().equals("array")) {
+      return true; // existing avro data
+    } else if (repeatedType.getName().equals(parentName + "_tuple")) {
+      return true; // existing thrift data
+    }
+    // false for the following cases:
+    // * name is "list", which matches the spec
+    // * name is "bag", which indicates existing hive or pig data
+    // * ambiguous case, which should be assumed is 3-level according to spec
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/HiveGroupConverter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/HiveGroupConverter.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/HiveGroupConverter.java
new file mode 100644
index 0000000..d41fb7e
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/parquet/HiveGroupConverter.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.io.api.Converter;
+import org.apache.parquet.io.api.GroupConverter;
+import org.apache.parquet.io.api.PrimitiveConverter;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+
+import java.util.Map;
+
+public abstract class HiveGroupConverter extends GroupConverter implements 
ConverterParent {
+
+  private Map<String, String> metadata;
+
+  public void setMetadata(Map<String, String> metadata) {
+    this.metadata = metadata;
+  }
+
+  public Map<String, String> getMetadata() {
+    return metadata;
+  }
+
+  protected static PrimitiveConverter 
getConverterFromDescription(PrimitiveType type, int index, ConverterParent
+      parent, TypeDescription hiveTypeInfo) {
+    if (type == null) {
+      return null;
+    }
+
+    return ETypeConverter.getNewConverter(type, index, parent, hiveTypeInfo);
+  }
+
+  protected static HiveGroupConverter getConverterFromDescription(GroupType 
type, int index, ConverterParent parent,
+                                                                  
TypeDescription hiveTypeInfo) {
+    if (type == null) {
+      return null;
+    }
+
+    OriginalType annotation = type.getOriginalType();
+    if (annotation == OriginalType.LIST) {
+      return HiveCollectionConverter.forList(type, parent, index, 
hiveTypeInfo);
+    } else if (annotation == OriginalType.MAP || annotation == 
OriginalType.MAP_KEY_VALUE) {
+      return HiveCollectionConverter.forMap(type, parent, index, hiveTypeInfo);
+    }
+
+    return new HiveStructConverter(type, parent, index, hiveTypeInfo);
+  }
+
+  protected static Converter getConverterFromDescription(Type type, int index, 
ConverterParent parent, TypeDescription hiveTypeInfo) {
+    if (type == null) {
+      return null;
+    }
+
+    if (type.isPrimitive()) {
+      return getConverterFromDescription(type.asPrimitiveType(), index, 
parent, hiveTypeInfo);
+    }
+
+    return getConverterFromDescription(type.asGroupType(), index, parent, 
hiveTypeInfo);
+  }
+
+  public abstract void set(int index, Writable value);
+
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/HiveSchemaConverter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/HiveSchemaConverter.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/HiveSchemaConverter.java
new file mode 100644
index 0000000..7243fb6
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/parquet/HiveSchemaConverter.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import java.util.List;
+
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.schema.ConversionPatterns;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.schema.Type.Repetition;
+import org.apache.parquet.schema.Types;
+
+/**
+ * Translates an ORC TypeDescription into the equivalent Parquet schema.
+ */
+public class HiveSchemaConverter {
+
+  // Map precision to the number bytes needed for binary conversion.
+  // Entry [p - 1] holds the byte count for precision p, for p in 1..38.
+  static final int[] PRECISION_TO_BYTE_COUNT = new int[38];
+  static {
+    for (int prec = 1; prec <= 38; prec++) {
+      // Estimated number of bytes needed.
+      PRECISION_TO_BYTE_COUNT[prec - 1] = (int)
+          Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8);
+    }
+  }
+
+  /**
+   * Converts the fields of a type into a Parquet message named
+   * "hive_schema".
+   *
+   * @param hiveType the type whose field names and children are converted
+   * @return the Parquet message type
+   */
+  public static MessageType convert(TypeDescription hiveType) {
+    return new MessageType("hive_schema",
+        convertTypes(hiveType.getFieldNames(), hiveType.getChildren()));
+  }
+
+  /**
+   * Converts parallel lists of column names and column types.
+   *
+   * @throws IllegalStateException if the two lists differ in length
+   */
+  private static Type[] convertTypes(final List<String> columnNames,
+                                     final List<TypeDescription> columnTypes) {
+    if (columnNames.size() != columnTypes.size()) {
+      throw new IllegalStateException(
+          "Mismatched Hive columns and types. Hive columns names" +
+          " found : " + columnNames + " . And Hive types found : " +
+          columnTypes);
+    }
+    final Type[] types = new Type[columnNames.size()];
+    for (int i = 0; i < columnNames.size(); ++i) {
+      types[i] = convertType(columnNames.get(i), columnTypes.get(i));
+    }
+    return types;
+  }
+
+  /** Converts a type as an optional field. */
+  private static Type convertType(final String name,
+                                  final TypeDescription typeInfo) {
+    return convertType(name, typeInfo, Repetition.OPTIONAL);
+  }
+
+  /**
+   * Converts a type into the Parquet type of the given name.
+   *
+   * @param name       the name of the resulting Parquet field
+   * @param typeInfo   the type to convert
+   * @param repetition the repetition applied to primitive results; LIST,
+   *                   MAP and STRUCT results are always optional groups
+   * @throws IllegalArgumentException if the category is not supported
+   */
+  private static Type convertType(final String name,
+                                  final TypeDescription typeInfo,
+                                  final Repetition repetition) {
+    switch (typeInfo.getCategory()) {
+      case STRING:
+      case CHAR:
+      case VARCHAR:
+        // All character types are stored as UTF8 binary. CHAR and VARCHAR
+        // honor the requested repetition exactly like STRING does, so that
+        // for instance a map key of these types is emitted as required.
+        return Types.primitive(PrimitiveTypeName.BINARY, repetition)
+            .as(OriginalType.UTF8)
+            .named(name);
+      case BYTE:
+      case SHORT:
+      case INT:
+        return Types.primitive(PrimitiveTypeName.INT32, repetition)
+            .named(name);
+      case LONG:
+        return Types.primitive(PrimitiveTypeName.INT64, repetition)
+            .named(name);
+      case FLOAT:
+        return Types.primitive(PrimitiveTypeName.FLOAT, repetition)
+            .named(name);
+      case DOUBLE:
+        return Types.primitive(PrimitiveTypeName.DOUBLE, repetition)
+            .named(name);
+      case BOOLEAN:
+        return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition)
+            .named(name);
+      case TIMESTAMP:
+        return Types.primitive(PrimitiveTypeName.INT96, repetition)
+            .named(name);
+      case BINARY:
+        return Types.primitive(PrimitiveTypeName.BINARY, repetition)
+            .named(name);
+      case DECIMAL: {
+        int prec = typeInfo.getPrecision();
+        int scale = typeInfo.getScale();
+        int bytes = PRECISION_TO_BYTE_COUNT[prec - 1];
+        // Honors the requested repetition like the other primitives.
+        return Types.primitive(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY,
+                repetition)
+            .length(bytes)
+            .as(OriginalType.DECIMAL)
+            .scale(scale)
+            .precision(prec)
+            .named(name);
+      }
+      case DATE:
+        return Types.primitive(PrimitiveTypeName.INT32, repetition)
+            .as(OriginalType.DATE).named(name);
+      case LIST:
+        return convertArrayType(name, typeInfo);
+      case MAP:
+        return convertMapType(name, typeInfo);
+      case STRUCT:
+        return convertStructType(name, typeInfo);
+      default:
+        throw new IllegalArgumentException("Unimplemented type " + typeInfo);
+    }
+  }
+
+  // An optional LIST group containing a repeated group "array", containing
+  // 1 optional element "array_element"
+  @SuppressWarnings("deprecation")
+  private static GroupType convertArrayType(final String name,
+                                            final TypeDescription typeInfo) {
+    final Type element =
+        convertType("array_element", typeInfo.getChildren().get(0));
+    return new GroupType(Repetition.OPTIONAL, name, OriginalType.LIST,
+        new GroupType(Repetition.REPEATED, "array", element));
+  }
+
+  // An optional group containing multiple elements
+  private static GroupType convertStructType(final String name,
+                                             final TypeDescription typeInfo) {
+    return new GroupType(Repetition.OPTIONAL, name,
+        convertTypes(typeInfo.getFieldNames(), typeInfo.getChildren()));
+  }
+
+  // An optional map group built by ConversionPatterns.mapType from
+  // 2 elements: a required "key" and an optional "value"
+  private static GroupType convertMapType(final String name,
+                                          final TypeDescription typeInfo) {
+    final Type keyType = convertType("key",
+        typeInfo.getChildren().get(0), Repetition.REQUIRED);
+    final Type valueType = convertType("value",
+        typeInfo.getChildren().get(1));
+    return ConversionPatterns.mapType(Repetition.OPTIONAL, name, keyType,
+        valueType);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/1752e172/java/bench/src/java/org/apache/orc/bench/parquet/HiveStructConverter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/src/java/org/apache/orc/bench/parquet/HiveStructConverter.java 
b/java/bench/src/java/org/apache/orc/bench/parquet/HiveStructConverter.java
new file mode 100644
index 0000000..250b4a2
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/parquet/HiveStructConverter.java
@@ -0,0 +1,192 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.parquet;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.io.api.Converter;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.Type;
+
+/**
+ * Converter for a Parquet group that is read as a struct. The value of each
+ * field is stored at the position the field has in the containing group, and
+ * the finished row is exposed as an ArrayWritable. Converters for nested
+ * fields are created recursively.
+ */
+public class HiveStructConverter extends HiveGroupConverter {
+
+  private final int totalFieldCount;
+  private Converter[] converters;
+  private final ConverterParent parent;
+  private final int index;
+  private Writable[] writables;
+  private List<Repeated> repeatedConverters;
+  private boolean reuseWritableArray = false;
+  private List<String> hiveFieldNames;
+  private List<TypeDescription> hiveFieldTypeInfos;
+
+  /**
+   * Creates a converter without a parent. The array of field values is
+   * allocated once and reused for every record.
+   *
+   * @param requestedSchema the fields that are actually read
+   * @param tableSchema     the group that fixes the position of each field
+   * @param metadata        the metadata made available to child converters
+   * @param hiveTypeInfo    the expected type, may be null
+   */
+  public HiveStructConverter(final GroupType requestedSchema,
+                             final GroupType tableSchema,
+                             Map<String, String> metadata,
+                             TypeDescription hiveTypeInfo) {
+    setMetadata(metadata);
+    this.reuseWritableArray = true;
+    this.writables = new Writable[tableSchema.getFieldCount()];
+    this.parent = null;
+    this.index = 0;
+    this.totalFieldCount = tableSchema.getFieldCount();
+    init(requestedSchema, null, 0, tableSchema, hiveTypeInfo);
+  }
+
+  /**
+   * Creates a nested converter that reads every field of the group.
+   *
+   * @param groupType    the group to read
+   * @param parent       the converter that receives the finished struct
+   * @param index        the position passed to the parent with the result
+   * @param hiveTypeInfo the expected type, may be null
+   */
+  public HiveStructConverter(final GroupType groupType,
+                             final ConverterParent parent,
+                             final int index,
+                             TypeDescription hiveTypeInfo) {
+    this(groupType, parent, index, groupType, hiveTypeInfo);
+  }
+
+  /**
+   * Creates a nested converter that reads a selection of the fields of a
+   * containing group.
+   *
+   * @param selectedGroupType   the fields that are actually read
+   * @param parent              the converter that receives the finished struct
+   * @param index               the position passed to the parent with the
+   *                            result
+   * @param containingGroupType the group that fixes the position of each field
+   * @param hiveTypeInfo        the expected type, may be null
+   */
+  public HiveStructConverter(final GroupType selectedGroupType,
+                             final ConverterParent parent,
+                             final int index,
+                             final GroupType containingGroupType,
+                             TypeDescription hiveTypeInfo) {
+    this.parent = parent;
+    this.index = index;
+    this.totalFieldCount = containingGroupType.getFieldCount();
+    init(selectedGroupType, parent, index, containingGroupType, hiveTypeInfo);
+  }
+
+  /**
+   * Builds one converter per selected field.
+   *
+   * @throws IllegalStateException if a selected field is not part of the
+   *                               containing group
+   */
+  private void init(final GroupType selectedGroupType,
+                    final ConverterParent parent,
+                    final int index,
+                    final GroupType containingGroupType,
+                    TypeDescription hiveTypeInfo) {
+    if (parent != null) {
+      setMetadata(parent.getMetadata());
+    }
+    final int selectedFieldCount = selectedGroupType.getFieldCount();
+
+    converters = new Converter[selectedFieldCount];
+    this.repeatedConverters = new ArrayList<>();
+
+    if (hiveTypeInfo != null &&
+        hiveTypeInfo.getCategory() == TypeDescription.Category.STRUCT) {
+      this.hiveFieldNames = hiveTypeInfo.getFieldNames();
+      this.hiveFieldTypeInfos = hiveTypeInfo.getChildren();
+    }
+
+    List<Type> selectedFields = selectedGroupType.getFields();
+    for (int i = 0; i < selectedFieldCount; i++) {
+      Type subtype = selectedFields.get(i);
+      if (!containingGroupType.getFields().contains(subtype)) {
+        throw new IllegalStateException("Group type [" + containingGroupType +
+            "] does not contain requested field: " + subtype);
+      }
+      // Values are stored at the position the field has in the containing
+      // group, not at its position within the selection.
+      int fieldIndex = containingGroupType.getFieldIndex(subtype.getName());
+      TypeDescription fieldTypeInfo =
+          getFieldTypeIgnoreCase(hiveTypeInfo, subtype.getName(), fieldIndex);
+      converters[i] = getFieldConverter(subtype, fieldIndex, fieldTypeInfo);
+    }
+  }
+
+  /**
+   * Looks up the expected type of a field.
+   *
+   * @return the type of the field, or null when none is known
+   * @throws RuntimeException if the expected type is neither a struct nor
+   *                          a map
+   */
+  private TypeDescription getFieldTypeIgnoreCase(TypeDescription hiveTypeInfo,
+                                                 String fieldName,
+                                                 int fieldIndex) {
+    if (hiveTypeInfo == null) {
+      return null;
+    } else if (hiveTypeInfo.getCategory() == TypeDescription.Category.STRUCT) {
+      return getStructFieldTypeInfo(fieldName, fieldIndex);
+    } else if (hiveTypeInfo.getCategory() == TypeDescription.Category.MAP) {
+      //This cover the case where hive table may have map<key, value> but the
+      // data file is of type array<struct<value1, value2>>
+      //Using index in place of type name.
+      if (fieldIndex == 0 || fieldIndex == 1) {
+        return hiveTypeInfo.getChildren().get(fieldIndex);
+      }
+      //Other fields are skipped for this case
+      return null;
+    }
+    throw new RuntimeException("Unknown hive type info " + hiveTypeInfo +
+        " when searching for field " + fieldName);
+  }
+
+  /**
+   * Looks up the type of a struct field, by position when index access is
+   * switched on in the metadata and otherwise by case-insensitive name.
+   *
+   * @return the type of the field, or null when the struct has no such field
+   */
+  private TypeDescription getStructFieldTypeInfo(String field, int fieldIndex) {
+    // The metadata may be absent; treat that like index access being off.
+    final Map<String, String> metadata = getMetadata();
+    final boolean accessByIndex = metadata != null && Boolean.parseBoolean(
+        metadata.get(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS));
+    if (accessByIndex && fieldIndex < hiveFieldNames.size()) {
+      return hiveFieldTypeInfos.get(fieldIndex);
+    }
+    // equalsIgnoreCase is independent of the default locale, so the name is
+    // compared as it is instead of being lower-cased first.
+    for (int i = 0; i < hiveFieldNames.size(); i++) {
+      if (field.equalsIgnoreCase(hiveFieldNames.get(i))) {
+        return hiveFieldTypeInfos.get(i);
+      }
+    }
+    //This means hive type doesn't refer this field that comes from file
+    //schema. i.e. the field is not required for hive table. It can occur due
+    //to schema evolution where some field is deleted.
+    return null;
+  }
+
+  /**
+   * Creates the converter of one field. Converters of repeated fields are
+   * additionally remembered, so that they can be notified when this struct
+   * starts and ends.
+   */
+  private Converter getFieldConverter(Type type, int fieldIndex,
+                                      TypeDescription hiveTypeInfo) {
+    if (!type.isRepetition(Type.Repetition.REPEATED)) {
+      return getConverterFromDescription(type, fieldIndex, this, hiveTypeInfo);
+    }
+
+    Converter converter;
+    if (type.isPrimitive()) {
+      converter = new Repeated.RepeatedPrimitiveConverter(
+          type.asPrimitiveType(), this, fieldIndex, hiveTypeInfo);
+    } else {
+      converter = new Repeated.RepeatedGroupConverter(
+          type.asGroupType(), this, fieldIndex,
+          hiveTypeInfo == null ? null : hiveTypeInfo.getChildren().get(0));
+    }
+    repeatedConverters.add((Repeated) converter);
+    return converter;
+  }
+
+  /** Returns the current field values wrapped in a new ArrayWritable. */
+  public final ArrayWritable getCurrentArray() {
+    return new ArrayWritable(Writable.class, writables);
+  }
+
+  /** Stores the value of the field at the given position. */
+  @Override
+  public void set(int fieldIndex, Writable value) {
+    writables[fieldIndex] = value;
+  }
+
+  /** Returns the converter at the given position of the selected fields. */
+  @Override
+  public Converter getConverter(final int fieldIndex) {
+    return converters[fieldIndex];
+  }
+
+  /**
+   * Prepares for a new struct: the value array is cleared when it is reused
+   * and replaced by a new array otherwise, then the repeated converters are
+   * told that their parent starts.
+   */
+  @Override
+  public void start() {
+    if (reuseWritableArray) {
+      // reset the array to null values
+      for (int i = 0; i < writables.length; i += 1) {
+        writables[i] = null;
+      }
+    } else {
+      this.writables = new Writable[totalFieldCount];
+    }
+    for (Repeated repeated : repeatedConverters) {
+      repeated.parentStart();
+    }
+  }
+
+  /**
+   * Finishes the struct: the repeated converters are told that their parent
+   * ends, then the values are handed to the parent, if there is one.
+   */
+  @Override
+  public void end() {
+    for (Repeated repeated : repeatedConverters) {
+      repeated.parentEnd();
+    }
+    if (parent != null) {
+      parent.set(index, getCurrentArray());
+    }
+  }
+
+}

Reply via email to