http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java b/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java
new file mode 100644
index 0000000..fe8f85e
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java
@@ -0,0 +1,523 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Generates reproducible pseudo-random data for benchmark tables.  Callers
+ * build up a struct schema via addField(...), attach a value generator to
+ * each field, and then fill Hive VectorizedRowBatch instances with generate().
+ * All randomness flows through one seeded Random, so a given seed always
+ * produces the same data.
+ */
+public class RandomGenerator {
+  private final TypeDescription schema = TypeDescription.createStruct();
+  private final List<Field> fields = new ArrayList<>();
+  private final Random random;
+
+  public RandomGenerator(int seed) {
+    random = new Random(seed);
+  }
+
+  /**
+   * Base class for per-column value generators.  nullProbability is the
+   * chance that any given row is set to null instead of a generated value.
+   */
+  private abstract class ValueGenerator {
+    double nullProbability = 0;
+    abstract void generate(ColumnVector vector, int valueCount);
+  }
+
+  /** Generates uniform 0/1 values for BOOLEAN columns (stored as longs). */
+  private class RandomBoolean extends ValueGenerator {
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.vector[r] = random.nextInt(2);
+        }
+      }
+    }
+  }
+
+  /**
+   * Generates lists with uniformly random lengths in [minSize, maxSize],
+   * delegating element generation to the child field's generator.
+   */
+  private class RandomList extends ValueGenerator {
+    private final int minSize;
+    private final int sizeRange;
+    private final Field child;
+
+    public RandomList(int minSize, int maxSize, Field child) {
+      this.minSize = minSize;
+      this.sizeRange = maxSize - minSize + 1;
+      this.child = child;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      ListColumnVector vector = (ListColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          // each non-null row consumes lengths[r] slots of the child vector
+          vector.offsets[r] = vector.childCount;
+          vector.lengths[r] = random.nextInt(sizeRange) + minSize;
+          vector.childCount += vector.lengths[r];
+        }
+      }
+      // grow the child vector to hold all elements, then fill it in one pass
+      vector.child.ensureSize(vector.childCount, false);
+      child.generator.generate(vector.child, vector.childCount);
+    }
+  }
+
+  /**
+   * Generates struct values by marking struct-level nulls first and then
+   * letting each child generator fill its own column.  Note that child
+   * generators still produce values for rows where the struct is null.
+   */
+  private class RandomStruct extends ValueGenerator {
+    private final Field[] children;
+
+    public RandomStruct(Field[] children) {
+      this.children = children;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      StructColumnVector vector = (StructColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        }
+      }
+      for(int c=0; c < children.length; ++c) {
+        children[c].generator.generate(vector.fields[c], valueCount);
+      }
+    }
+  }
+
+  /**
+   * Base for integer generators that clamps generated longs to the bit width
+   * of the declared type (BYTE/SHORT/INT/LONG) by sign extension.
+   */
+  private abstract class IntegerGenerator extends ValueGenerator {
+    private final long sign;
+    private final long mask;
+
+    private IntegerGenerator(TypeDescription.Category kind) {
+      int bits = getIntegerLength(kind);
+      // mask covers all bits above the type's width; 64-bit needs no mask
+      mask = bits == 64 ? 0 : -1L << bits;
+      sign = 1L << (bits - 1);
+    }
+
+    protected void normalize(LongColumnVector vector, int valueCount) {
+      // make sure the value stays in range by sign extending it
+      for(int r=0; r < valueCount; ++r) {
+        if ((vector.vector[r] & sign) == 0) {
+          vector.vector[r] &= ~mask;
+        } else {
+          vector.vector[r] |= mask;
+        }
+      }
+    }
+  }
+
+  /**
+   * Generates a strictly increasing sequence (start, start+increment, ...),
+   * sign-extended to the width of the declared integer type.  Null rows do
+   * not consume a sequence value.
+   */
+  private class AutoIncrement extends IntegerGenerator {
+    private long value;
+    private final long increment;
+
+    private AutoIncrement(TypeDescription.Category kind, long start,
+                          long increment) {
+      super(kind);
+      this.value = start;
+      this.increment = increment;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        // Fixed inverted null test: this used >= where every other generator
+        // uses <, which made hasNulls(p) null out a fraction 1-p of the rows
+        // instead of p.
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.vector[r] = value;
+          value += increment;
+        }
+      }
+      normalize(vector, valueCount);
+    }
+  }
+
+  /**
+   * Generates uniform random values over the full range of the declared
+   * integer type (full 64 bits, then sign-extended down by normalize).
+   */
+  private class RandomInteger extends IntegerGenerator {
+
+    private RandomInteger(TypeDescription.Category kind) {
+      super(kind);
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.vector[r] = random.nextLong();
+        }
+      }
+      normalize(vector, valueCount);
+    }
+  }
+
+  /**
+   * Generates uniform random values in [minimum, maximum] using rejection
+   * sampling to avoid modulo bias: draws above `limit` (the largest multiple
+   * of `range` below Long.MAX_VALUE) are discarded and redrawn.
+   */
+  private class IntegerRange extends IntegerGenerator {
+    private final long minimum;
+    private final long range;
+    private final long limit;
+
+    private IntegerRange(TypeDescription.Category kind, long minimum,
+                         long maximum) {
+      super(kind);
+      this.minimum = minimum;
+      this.range = maximum - minimum + 1;
+      // range overflows to negative when maximum - minimum + 1 > Long.MAX_VALUE
+      if (this.range < 0) {
+        throw new IllegalArgumentException("Can't support a negative range "
+            + range);
+      }
+      limit = (Long.MAX_VALUE / range) * range;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          long rand;
+          do {
+            // clear the sign bit
+            rand = random.nextLong() & Long.MAX_VALUE;
+          } while (rand >= limit);
+          vector.vector[r] = (rand % range) + minimum;
+        }
+      }
+      normalize(vector, valueCount);
+    }
+  }
+
+  /**
+   * Picks uniformly from a fixed set of string choices.  Uses setRef so all
+   * rows share the pre-encoded byte arrays rather than copying.
+   * NOTE(review): values[e].getBytes() uses the platform default charset —
+   * presumably fine for the ASCII choices used here, but confirm if non-ASCII
+   * choices are ever passed.
+   */
+  private class StringChooser extends ValueGenerator {
+    private final byte[][] choices;
+    private StringChooser(String[] values) {
+      choices = new byte[values.length][];
+      for(int e=0; e < values.length; ++e) {
+        choices[e] = values[e].getBytes();
+      }
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      BytesColumnVector vector = (BytesColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          int val = random.nextInt(choices.length);
+          vector.setRef(r, choices[val], 0, choices[val].length);
+        }
+      }
+    }
+  }
+
+  /** Concatenates two byte arrays; used to build the character tables below. */
+  private static byte[] concat(byte[] left, byte[] right) {
+    byte[] result = new byte[left.length + right.length];
+    System.arraycopy(left, 0, result, 0, left.length);
+    System.arraycopy(right, 0, result, left.length, right.length);
+    return result;
+  }
+
+  /** Returns one byte chosen uniformly from choices. */
+  private static byte pickOne(byte[] choices, Random random) {
+    return choices[random.nextInt(choices.length)];
+  }
+
+  // Character classes used by StringPattern's template characters.
+  private static final byte[] LOWER_CONSONANTS =
+      "bcdfghjklmnpqrstvwxyz".getBytes();
+  private static final byte[] UPPER_CONSONANTS =
+      "BCDFGHJKLMNPQRSTVWXYZ".getBytes();
+  private static final byte[] CONSONANTS =
+      concat(LOWER_CONSONANTS, UPPER_CONSONANTS);
+  private static final byte[] LOWER_VOWELS = "aeiou".getBytes();
+  private static final byte[] UPPER_VOWELS = "AEIOU".getBytes();
+  private static final byte[] VOWELS = concat(LOWER_VOWELS, UPPER_VOWELS);
+  private static final byte[] LOWER_LETTERS =
+      concat(LOWER_CONSONANTS, LOWER_VOWELS);
+  private static final byte[] UPPER_LETTERS =
+      concat(UPPER_CONSONANTS, UPPER_VOWELS);
+  private static final byte[] LETTERS = concat(LOWER_LETTERS, UPPER_LETTERS);
+  private static final byte[] NATURAL_DIGITS = "123456789".getBytes();
+  private static final byte[] DIGITS = "0123456789".getBytes();
+
+  /**
+   * Generates strings from a template.  Template characters are replaced with
+   * a random character from the matching class; all other characters are kept
+   * literally:
+   *   C/c = upper/lower consonant, E = any consonant,
+   *   V/v = upper/lower vowel,     F = any vowel,
+   *   L/l = upper/lower letter,    D = any letter,
+   *   x   = digit 1-9,             X = digit 0-9.
+   * The template bytes are mutated in place each row, so output length always
+   * equals the pattern length.  (setVal copies, so reuse of buffer is safe.)
+   */
+  private class StringPattern extends ValueGenerator {
+    private final byte[] buffer;
+    private final byte[][] choices;
+    private final int[] locations;
+
+    private StringPattern(String pattern) {
+      buffer = pattern.getBytes();
+      // first pass: count the substitution points so the arrays can be sized
+      int locs = 0;
+      for(int i=0; i < buffer.length; ++i) {
+        switch (buffer[i]) {
+          case 'C':
+          case 'c':
+          case 'E':
+          case 'V':
+          case 'v':
+          case 'F':
+          case 'l':
+          case 'L':
+          case 'D':
+          case 'x':
+          case 'X':
+            locs += 1;
+            break;
+          default:
+            break;
+        }
+      }
+      locations = new int[locs];
+      choices = new byte[locs][];
+      locs = 0;
+      // second pass: record each substitution point and its character class
+      for(int i=0; i < buffer.length; ++i) {
+        switch (buffer[i]) {
+          case 'C':
+            locations[locs] = i;
+            choices[locs++] = UPPER_CONSONANTS;
+            break;
+          case 'c':
+            locations[locs] = i;
+            choices[locs++] = LOWER_CONSONANTS;
+            break;
+          case 'E':
+            locations[locs] = i;
+            choices[locs++] = CONSONANTS;
+            break;
+          case 'V':
+            locations[locs] = i;
+            choices[locs++] = UPPER_VOWELS;
+            break;
+          case 'v':
+            locations[locs] = i;
+            choices[locs++] = LOWER_VOWELS;
+            break;
+          case 'F':
+            locations[locs] = i;
+            choices[locs++] = VOWELS;
+            break;
+          case 'l':
+            locations[locs] = i;
+            choices[locs++] = LOWER_LETTERS;
+            break;
+          case 'L':
+            locations[locs] = i;
+            choices[locs++] = UPPER_LETTERS;
+            break;
+          case 'D':
+            locations[locs] = i;
+            choices[locs++] = LETTERS;
+            break;
+          case 'x':
+            locations[locs] = i;
+            choices[locs++] = NATURAL_DIGITS;
+            break;
+          case 'X':
+            locations[locs] = i;
+            choices[locs++] = DIGITS;
+            break;
+          default:
+            break;
+        }
+      }
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      BytesColumnVector vector = (BytesColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          for(int m=0; m < locations.length; ++m) {
+            buffer[locations[m]] = pickOne(choices[m], random);
+          }
+          vector.setVal(r, buffer, 0, buffer.length);
+        }
+      }
+    }
+  }
+
+  /**
+   * Generates timestamps uniformly between two inclusive bounds given as
+   * "yyyy-mm-dd hh:mm:ss" strings.  Milliseconds are drawn by rejection
+   * sampling (same scheme as IntegerRange) to avoid modulo bias.
+   * NOTE(review): nanos is drawn from [0, 1000000) — only sub-millisecond
+   * microsecond-scale precision, not the full [0, 1e9) nanosecond range of
+   * TimestampColumnVector.nanos; confirm whether that is intentional.
+   */
+  private class TimestampRange extends ValueGenerator {
+    private final long minimum;
+    private final long range;
+    private final long limit;
+
+    private TimestampRange(String min, String max) {
+      minimum = Timestamp.valueOf(min).getTime();
+      range = Timestamp.valueOf(max).getTime() - minimum + 1;
+      if (range < 0) {
+        throw new IllegalArgumentException("Negative range " + range);
+      }
+      limit = (Long.MAX_VALUE / range) * range;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      TimestampColumnVector vector = (TimestampColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          long rand;
+          do {
+            // clear the sign bit
+            rand = random.nextLong() & Long.MAX_VALUE;
+          } while (rand >= limit);
+          vector.time[r] = (rand % range) + minimum;
+          vector.nanos[r] = random.nextInt(1000000);
+        }
+      }
+    }
+  }
+
+  /**
+   * Maps an integer type category to its bit width; rejects non-integer
+   * categories.
+   */
+  private static int getIntegerLength(TypeDescription.Category kind) {
+    switch (kind) {
+      case BYTE:
+        return 8;
+      case SHORT:
+        return 16;
+      case INT:
+        return 32;
+      case LONG:
+        return 64;
+      default:
+        throw new IllegalArgumentException("Unhandled type " + kind);
+    }
+  }
+
+  /**
+   * A column (or nested sub-column) of the generated schema.  The addXxx
+   * methods attach a value generator and return this, so callers can chain
+   * e.g. addIntegerRange(...).hasNulls(...).
+   * NOTE(review): hasNulls assumes a generator was already attached; calling
+   * it first would throw NullPointerException.
+   */
+  public class Field {
+    private final TypeDescription type;
+    private Field[] children;
+    private ValueGenerator generator;
+
+    private Field(TypeDescription type) {
+      this.type = type;
+      // recursively mirror the type tree for compound types
+      if (!type.getCategory().isPrimitive()) {
+        List<TypeDescription> childrenTypes = type.getChildren();
+        children = new Field[childrenTypes.size()];
+        for(int c=0; c < children.length; ++c) {
+          children[c] = new Field(childrenTypes.get(c));
+        }
+      }
+    }
+
+    /** Counting sequence start, start+increment, ... for integer types. */
+    public Field addAutoIncrement(long start, long increment) {
+      generator = new AutoIncrement(type.getCategory(), start, increment);
+      return this;
+    }
+
+    /** Uniform random integers in [min, max]. */
+    public Field addIntegerRange(long min, long max) {
+      generator = new IntegerRange(type.getCategory(), min, max);
+      return this;
+    }
+
+    /** Uniform random integers over the type's full range. */
+    public Field addRandomInt() {
+      generator = new RandomInteger(type.getCategory());
+      return this;
+    }
+
+    /** Uniform choice among fixed strings; STRING columns only. */
+    public Field addStringChoice(String... choices) {
+      if (type.getCategory() != TypeDescription.Category.STRING) {
+        throw new IllegalArgumentException("Must be string - " + type);
+      }
+      generator = new StringChooser(choices);
+      return this;
+    }
+
+    /** Template-based strings (see StringPattern); STRING columns only. */
+    public Field addStringPattern(String pattern) {
+      if (type.getCategory() != TypeDescription.Category.STRING) {
+        throw new IllegalArgumentException("Must be string - " + type);
+      }
+      generator = new StringPattern(pattern);
+      return this;
+    }
+
+    /** Uniform timestamps in [start, end]; TIMESTAMP columns only. */
+    public Field addTimestampRange(String start, String end) {
+      if (type.getCategory() != TypeDescription.Category.TIMESTAMP) {
+        throw new IllegalArgumentException("Must be timestamp - " + type);
+      }
+      generator = new TimestampRange(start, end);
+      return this;
+    }
+
+    /** Uniform true/false; BOOLEAN columns only. */
+    public Field addBoolean() {
+      if (type.getCategory() != TypeDescription.Category.BOOLEAN) {
+        throw new IllegalArgumentException("Must be boolean - " + type);
+      }
+      generator = new RandomBoolean();
+      return this;
+    }
+
+    /** Sets the probability that a row of this column is null. */
+    public Field hasNulls(double probability) {
+      generator.nullProbability = probability;
+      return this;
+    }
+
+    /** Attaches a struct generator; children must be configured separately. */
+    public Field addStruct() {
+      generator = new RandomStruct(children);
+      return this;
+    }
+
+    /** Attaches a list generator with lengths in [minSize, maxSize]. */
+    public Field addList(int minSize, int maxSize) {
+      generator = new RandomList(minSize, maxSize, children[0]);
+      return this;
+    }
+
+    public Field getChildField(int child) {
+      return children[child];
+    }
+  }
+
+  /** Adds a top-level column of a primitive category. */
+  public Field addField(String name, TypeDescription.Category kind) {
+    TypeDescription type = new TypeDescription(kind);
+    return addField(name, type);
+  }
+
+  /** Adds a top-level column of an arbitrary (possibly compound) type. */
+  public Field addField(String name, TypeDescription type) {
+    schema.addField(name, type);
+    Field result = new Field(type);
+    fields.add(result);
+    return result;
+  }
+
+  /**
+   * Resets the batch and fills every column with rowCount generated rows.
+   * Assumes the batch was created from this generator's schema, so that
+   * batch.cols lines up one-to-one with the registered fields.
+   */
+  public void generate(VectorizedRowBatch batch, int rowCount) {
+    batch.reset();
+    for(int c=0; c < batch.cols.length; ++c) {
+      fields.get(c).generator.generate(batch.cols[c], rowCount);
+    }
+    batch.size = rowCount;
+  }
+
+  /**
+   * Get the schema for the table that is being generated.
+   * @return the root struct TypeDescription containing all added fields
+   */
+  public TypeDescription getSchema() {
+    return schema;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java b/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java
new file mode 100644
index 0000000..56b7f5e
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java
@@ -0,0 +1,200 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+
+/**
+ * Synthesizes a sales-like benchmark table with a fixed mix of column types,
+ * null densities, and value distributions, backed by RandomGenerator.
+ * Deterministic for a given seed.
+ */
+public class SalesGenerator {
+  private final RandomGenerator generator;
+  private long rowsRemaining;
+  // probability used for columns that are almost always null
+  private final static double MOSTLY = 0.99999;
+
+  public SalesGenerator(long rows) {
+    this(rows, 42);
+  }
+
+  /**
+   * Builds the full sales schema.  The hasNulls probabilities mimic the null
+   * densities observed in the modeled dataset; columns tagged MOSTLY are
+   * almost always null.
+   */
+  public SalesGenerator(long rows, int seed) {
+    generator = new RandomGenerator(seed);
+    // column 1
+    generator.addField("sales_id", TypeDescription.Category.LONG)
+        .addAutoIncrement(1000000000, 1);
+    generator.addField("customer_id", TypeDescription.Category.LONG)
+        .addIntegerRange(1000000000, 2000000000);
+    generator.addField("col3", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 10000).hasNulls(0.9993100389335173);
+
+    // column 4
+    generator.addField("item_category", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000).hasNulls(0.00014784879996054823);
+    generator.addField("item_count", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000);
+    generator.addField("change_ts", TypeDescription.Category.TIMESTAMP)
+        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+
+    // column 7
+    generator.addField("store_location", TypeDescription.Category.STRING)
+        .addStringChoice("Los Angeles", "New York", "Cupertino", "Sunnyvale",
+            "Boston", "Chicago", "Seattle", "Jackson",
+            "Palo Alto", "San Mateo", "San Jose", "Santa Clara",
+            "Irvine", "Torrance", "Gardena", "Hermosa", "Manhattan")
+        .hasNulls(0.0004928293332019384);
+    generator.addField("associate_id", TypeDescription.Category.STRING)
+        .addStringPattern("MR V").hasNulls(0.05026859198659506);
+    generator.addField("col9", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000000).hasNulls(MOSTLY);
+
+    // column 10
+    generator.addField("rebate_id", TypeDescription.Category.STRING)
+        .addStringPattern("xxxxxx").hasNulls(MOSTLY);
+    generator.addField("create_ts", TypeDescription.Category.TIMESTAMP)
+        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+    generator.addField("col13", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 100000).hasNulls(MOSTLY);
+
+    // column 13
+    generator.addField("size", TypeDescription.Category.STRING)
+        .addStringChoice("Small", "Medium", "Large", "XL")
+        .hasNulls(0.9503720861465674);
+    generator.addField("col14", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 100000);
+    generator.addField("fulfilled", TypeDescription.Category.BOOLEAN)
+        .addBoolean();
+
+    // column 16
+    generator.addField("global_id", TypeDescription.Category.STRING)
+        .addStringPattern("xxxxxxxxxxxxxxxx").hasNulls(0.021388793060962974);
+    generator.addField("col17", TypeDescription.Category.STRING)
+        .addStringPattern("L-xx").hasNulls(MOSTLY);
+    generator.addField("col18", TypeDescription.Category.STRING)
+        .addStringPattern("ll").hasNulls(MOSTLY);
+
+    // column 19
+    generator.addField("col19", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 100000);
+    generator.addField("has_rebate", TypeDescription.Category.BOOLEAN)
+        .addBoolean();
+    // compound column: mostly-null list of structs with its own sub-generators
+    RandomGenerator.Field list =
+        generator.addField("col21",
+        TypeDescription.fromString("array<struct<sub1:bigint,sub2:string," +
+            "sub3:string,sub4:bigint,sub5:bigint,sub6:string>>"))
+        .addList(0, 3)
+        .hasNulls(MOSTLY);
+    RandomGenerator.Field struct = list.getChildField(0).addStruct();
+    struct.getChildField(0).addIntegerRange(0, 10000000);
+    struct.getChildField(1).addStringPattern("VVVVV");
+    struct.getChildField(2).addStringPattern("VVVVVVVV");
+    struct.getChildField(3).addIntegerRange(0, 10000000);
+    struct.getChildField(4).addIntegerRange(0, 10000000);
+    struct.getChildField(5).addStringPattern("VVVVVVVV");
+
+    // column 38
+    generator.addField("vendor_id", TypeDescription.Category.STRING)
+        .addStringPattern("Lxxxxxx").hasNulls(0.1870780148834459);
+    generator.addField("country", TypeDescription.Category.STRING)
+        .addStringChoice("USA", "Germany", "Ireland", "Canada", "Mexico",
+            "Denmark").hasNulls(0.0004928293332019384);
+
+    // column 40
+    generator.addField("backend_version", TypeDescription.Category.STRING)
+        .addStringPattern("X.xx").hasNulls(0.0005913951998423039);
+    generator.addField("col41", TypeDescription.Category.LONG)
+        .addIntegerRange(1000000000, 100000000000L);
+    generator.addField("col42", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000000);
+
+    // column 43
+    // (rejoined a statement that was split by email line-wrapping; the
+    // continuation line had lost its '+' prefix, breaking the patch)
+    generator.addField("col43", TypeDescription.Category.LONG)
+        .addIntegerRange(1000000000, 10000000000L).hasNulls(0.9763934749396284);
+    generator.addField("col44", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 100000000);
+    generator.addField("col45", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 100000000);
+
+    // column 46
+    generator.addField("col46", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 10000000);
+    generator.addField("col47", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000);
+    generator.addField("col48", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000).hasNulls(MOSTLY);
+
+    // column 49
+    generator.addField("col49", TypeDescription.Category.STRING)
+        .addStringPattern("xxxx").hasNulls(0.0004928293332019384);
+    generator.addField("col50", TypeDescription.Category.STRING)
+        .addStringPattern("ll").hasNulls(0.9496821250800848);
+    generator.addField("col51", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000).hasNulls(0.9999014341333596);
+
+    // column 52
+    generator.addField("col52", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000).hasNulls(0.9980779656005125);
+    generator.addField("col53", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000000);
+    generator.addField("col54", TypeDescription.Category.LONG)
+        .addIntegerRange(1,  1000000000);
+
+    // column 55
+    generator.addField("col55", TypeDescription.Category.STRING)
+        .addStringChoice("X");
+    generator.addField("col56", TypeDescription.Category.TIMESTAMP)
+        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+    generator.addField("col57", TypeDescription.Category.TIMESTAMP)
+        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+
+    // column 58
+    generator.addField("md5", TypeDescription.Category.LONG)
+        .addRandomInt();
+    generator.addField("col59", TypeDescription.Category.LONG)
+        .addIntegerRange(1000000000, 10000000000L);
+    generator.addField("col69", TypeDescription.Category.TIMESTAMP)
+        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59")
+        .hasNulls(MOSTLY);
+
+    // column 61
+    generator.addField("col61", TypeDescription.Category.STRING)
+        .addStringPattern("X.xx").hasNulls(0.11399142476960233);
+    generator.addField("col62", TypeDescription.Category.STRING)
+        .addStringPattern("X.xx").hasNulls(0.9986200778670347);
+    generator.addField("col63", TypeDescription.Category.TIMESTAMP)
+        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+
+    // column 64
+    generator.addField("col64", TypeDescription.Category.LONG)
+        .addIntegerRange(1, 1000000).hasNulls(MOSTLY);
+    rowsRemaining = rows;
+  }
+
+  /**
+   * Fills the batch with up to getMaxSize() generated rows.
+   * @return true if any rows were produced; false once the row budget is
+   *         exhausted (the caller's final call yields an empty batch)
+   */
+  public boolean nextBatch(VectorizedRowBatch batch) {
+    int rows = (int) Math.min(batch.getMaxSize(), rowsRemaining);
+    generator.generate(batch, rows);
+    rowsRemaining -= rows;
+    return rows != 0;
+  }
+
+  /** @return the schema of the generated sales table */
+  public TypeDescription getSchema() {
+    return generator.getSchema();
+  }
+
+  /** Smoke entry point: prints the schema of a small generated table. */
+  public static void main(String[] args) throws Exception {
+    SalesGenerator sales = new SalesGenerator(10, 42);
+    System.out.println("Schema " + sales.getSchema());
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java b/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java
new file mode 100644
index 0000000..d4fd4a2
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/SalesToAvro.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+
+/**
+ * Writes generated sales data to an Avro file.
+ * Usage: SalesToAvro &lt;output-path&gt; &lt;codec&gt; &lt;row-count&gt;
+ */
+public class SalesToAvro {
+
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    SalesGenerator sales = new SalesGenerator(Long.parseLong(args[2]));
+    TypeDescription schema = sales.getSchema();
+    // codec name is translated by the shared helper in TaxiToAvro
+    AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf,
+        TaxiToAvro.getCodec(args[1]));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    while (sales.nextBatch(batch)) {
+      writer.writeBatch(batch);
+    }
+    // NOTE(review): not closed in a finally block; acceptable for a one-shot
+    // benchmark tool since the JVM exits on failure anyway.
+    writer.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToJson.java b/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
new file mode 100644
index 0000000..500b6c9
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/SalesToJson.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.tools.FileDump;
+
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Writes generated sales data as newline-delimited JSON, optionally
+ * compressed.  Usage: SalesToJson &lt;output-path&gt; &lt;codec&gt; &lt;row-count&gt;
+ */
+public class SalesToJson {
+
+  public static void main(String[] args) throws Exception {
+    SalesGenerator sales = new SalesGenerator(Long.parseLong(args[2]));
+    TypeDescription schema = sales.getSchema();
+    Path path = new Path(args[0]);
+    VectorizedRowBatch batch = schema.createRowBatch();
+    Configuration conf = new Configuration();
+    // wrap the HDFS output stream in the requested compression codec
+    Writer output = new OutputStreamWriter(TaxiToJson.getCodec(args[1])
+        .create(path.getFileSystem(conf).create(path)));
+    while (sales.nextBatch(batch)) {
+      // one JSON object per row, one row per line
+      for(int r=0; r < batch.size; ++r) {
+        FileDump.printRow(output, batch, schema, r);
+        output.write("\n");
+      }
+    }
+    output.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
new file mode 100644
index 0000000..d570728
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/SalesToOrc.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+
+public class SalesToOrc {
+  public static void main(String[] args) throws Exception {
+    SalesGenerator sales = new SalesGenerator(Long.parseLong(args[2]));
+    VectorizedRowBatch batch = sales.getSchema().createRowBatch();
+    Configuration conf = new Configuration();
+    Writer writer = OrcFile.createWriter(new Path(args[0]),
+        OrcFile.writerOptions(conf)
+            .setSchema(sales.getSchema())
+            .compress(TaxiToOrc.getCodec(args[1])));
+    while (sales.nextBatch(batch)) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/SalesToParquet.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesToParquet.java 
b/java/bench/src/java/org/apache/orc/bench/SalesToParquet.java
new file mode 100644
index 0000000..985da90
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/SalesToParquet.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.orc.VectorToWritable;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.TypeDescription;
+
+import java.util.Properties;
+
+public class SalesToParquet {
+
+  public static void main(String[] args) throws Exception {
+    JobConf conf = new JobConf();
+    conf.set("mapred.task.id", "attempt_0_0_m_0_0");
+    conf.set("parquet.compression", TaxiToParquet.getCodec(args[1]));
+    SalesGenerator sales = new SalesGenerator(Long.parseLong(args[2]));
+    TypeDescription schema = sales.getSchema();
+    VectorizedRowBatch batch = schema.createRowBatch();
+    Path path = new Path(args[0]);
+    Properties properties = AvroWriter.setHiveSchema(schema);
+    MapredParquetOutputFormat format = new MapredParquetOutputFormat();
+    FileSinkOperator.RecordWriter writer = format.getHiveRecordWriter(conf,
+        path, ParquetHiveRecord.class, !"none".equals(args[1]), properties,
+        Reporter.NULL);
+    ParquetHiveRecord record = new ParquetHiveRecord();
+    record.inspector =
+        (StructObjectInspector) VectorToWritable.createObjectInspector(schema);
+    while (sales.nextBatch(batch)) {
+      for(int r=0; r < batch.size; ++r) {
+        record.value = VectorToWritable.createValue(batch, r, schema,
+            record.value);
+        writer.write(record);
+      }
+    }
+    writer.close(true);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java 
b/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java
new file mode 100644
index 0000000..b490a8a
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/TaxiToAvro.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+
+public class TaxiToAvro {
+
+  public static String getCodec(String compression) {
+    if ("none".equals(compression)) {
+      return "null";
+    } else if ("zlib".equals(compression)) {
+      return "deflate";
+    } else if ("snappy".equals(compression)) {
+      return "snappy";
+    }
+    throw new IllegalArgumentException("Unknown compression " + compression);
+  }
+
+  public static void main(String[] args) throws Exception {
+    TypeDescription schema = TaxiToOrc.loadSchema("nyc-taxi.schema");
+    Configuration conf = new Configuration();
+    AvroWriter writer = new AvroWriter(new Path(args[0]), schema, conf,
+        getCodec(args[1]));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
+      CsvReader reader = new CsvReader(new Path(inFile), conf, schema);
+      while (reader.nextBatch(batch)) {
+        writer.writeBatch(batch);
+      }
+    }
+    writer.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java 
b/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
new file mode 100644
index 0000000..98fbe17
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/TaxiToJson.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.tools.FileDump;
+import org.iq80.snappy.SnappyOutputStream;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.zip.GZIPOutputStream;
+
+public class TaxiToJson {
+
+  public enum CompressionKind {
+    NONE(""),
+    ZLIB(".gz"),
+    SNAPPY(".snappy");
+
+    CompressionKind(String extendsion) {
+      this.extension = extendsion;
+    }
+
+    private final String extension;
+
+    public String getExtension() {
+      return extension;
+    }
+
+    public OutputStream create(OutputStream out) throws IOException {
+      switch (this) {
+        case NONE:
+          return out;
+        case ZLIB:
+          return new GZIPOutputStream(out);
+        case SNAPPY:
+          return new SnappyOutputStream(out);
+        default:
+          throw new IllegalArgumentException("Unhandled kind " + this);
+      }
+    }
+  }
+
+  public static CompressionKind getCodec(String name) {
+    if ("none".equals(name)) {
+      return CompressionKind.NONE;
+    } else if ("zlib".equals(name)) {
+      return CompressionKind.ZLIB;
+    } else if ("snappy".equals(name)) {
+      return CompressionKind.SNAPPY;
+    } throw new IllegalArgumentException("Unhnadled kind " + name);
+  }
+
+  public static void main(String[] args) throws Exception {
+    TypeDescription schema = TaxiToOrc.loadSchema("nyc-taxi.schema");
+    Path path = new Path(args[0]);
+    VectorizedRowBatch batch = schema.createRowBatch();
+    Configuration conf = new Configuration();
+    Writer output = new OutputStreamWriter(getCodec(args[1])
+        .create(path.getFileSystem(conf).create(path)));
+    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
+      CsvReader reader = new CsvReader(new Path(inFile), conf, schema);
+      while (reader.nextBatch(batch)) {
+        for(int r=0; r < batch.size; ++r) {
+          FileDump.printRow(output, batch, schema, r);
+          output.write("\n");
+        }
+      }
+    }
+    output.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java 
b/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
new file mode 100644
index 0000000..8f66c04
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/TaxiToOrc.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.orc.OrcFile;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+
+public class TaxiToOrc {
+
+  /**
+   * Loads a TypeDescription from a classpath resource, stripping newlines
+   * and spaces so the multi-line schema file parses as a single string.
+   * @param name the resource name (e.g. "nyc-taxi.schema")
+   * @return the parsed schema
+   * @throws IOException if the resource cannot be read
+   */
+  public static TypeDescription loadSchema(String name) throws IOException {
+    StringBuilder string = new StringBuilder();
+    // try-with-resources: the stream is closed even if reading fails
+    try (InputStream in =
+             TaxiToOrc.class.getClassLoader().getResourceAsStream(name)) {
+      byte[] buffer = new byte[1024];
+      int len = in.read(buffer);
+      while (len > 0) {
+        for(int i=0; i < len; ++i) {
+          // strip out whitespace
+          if (buffer[i] != '\n' && buffer[i] != ' ') {
+            string.append((char) buffer[i]);
+          }
+        }
+        len = in.read(buffer);
+      }
+    }
+    return TypeDescription.fromString(string.toString());
+  }
+
+  /**
+   * Maps the benchmark compression names to ORC CompressionKind values.
+   * @throws IllegalArgumentException for unrecognized names
+   */
+  public static CompressionKind getCodec(String compression) {
+    if ("none".equals(compression)) {
+      return CompressionKind.NONE;
+    } else if ("zlib".equals(compression)) {
+      return CompressionKind.ZLIB;
+    } else if ("snappy".equals(compression)) {
+      return CompressionKind.SNAPPY;
+    } else {
+      throw new IllegalArgumentException("Unknown compression " + compression);
+    }
+  }
+
+  /**
+   * Returns a view of array[start..] as an Iterable. The returned Iterable
+   * may be iterated any number of times; each call to iterator() starts a
+   * fresh pass from start.
+   */
+  public static Iterable<String> sliceArray(final String[] array,
+                                            final int start) {
+    return new Iterable<String>() {
+      @Override
+      public Iterator<String> iterator() {
+        return new Iterator<String>() {
+          // each iterator keeps its own cursor so that iterating the
+          // Iterable twice does not yield an empty second pass
+          private int posn = start;
+
+          @Override
+          public boolean hasNext() {
+            return posn < array.length;
+          }
+
+          @Override
+          public String next() {
+            return array[posn++];
+          }
+
+          @Override
+          public void remove() {
+            throw new UnsupportedOperationException("No remove");
+          }
+        };
+      }
+    };
+  }
+
+  /**
+   * Converts the NYC taxi CSV files to a single ORC file.
+   *
+   * Usage: TaxiToOrc &lt;output-path&gt; &lt;compression&gt; &lt;input-csv&gt;...
+   */
+  public static void main(String[] args) throws Exception {
+    TypeDescription schema = loadSchema("nyc-taxi.schema");
+    VectorizedRowBatch batch = schema.createRowBatch();
+    Configuration conf = new Configuration();
+    Writer writer = OrcFile.createWriter(new Path(args[0]),
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .compress(getCodec(args[1])));
+    for(String inFile: sliceArray(args, 2)) {
+      CsvReader reader = new CsvReader(new Path(inFile), conf, schema);
+      while (reader.nextBatch(batch)) {
+        writer.addRowBatch(batch);
+      }
+    }
+    writer.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/java/org/apache/orc/bench/TaxiToParquet.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/TaxiToParquet.java 
b/java/bench/src/java/org/apache/orc/bench/TaxiToParquet.java
new file mode 100644
index 0000000..3edce17
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/TaxiToParquet.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.orc.VectorToWritable;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.TypeDescription;
+
+import java.util.Properties;
+public class TaxiToParquet {
+
+  /**
+   * Maps the benchmark compression names to Parquet codec names.
+   * @throws IllegalArgumentException for unrecognized names
+   */
+  public static String getCodec(String name) {
+    if ("none".equals(name)) {
+      return "UNCOMPRESSED";
+    } else if ("zlib".equals(name)) {
+      return "GZIP";
+    } else if ("snappy".equals(name)) {
+      return "SNAPPY";
+    } else {
+      throw new IllegalArgumentException("Unknown compression " + name);
+    }
+  }
+
+  /**
+   * Converts the NYC taxi CSV files to a Parquet file via the Hive
+   * MapredParquetOutputFormat record writer.
+   *
+   * Usage: TaxiToParquet &lt;output-path&gt; &lt;compression&gt; &lt;input-csv&gt;...
+   */
+  public static void main(String[] args) throws Exception {
+    TypeDescription schema = TaxiToOrc.loadSchema("nyc-taxi.schema");
+    VectorizedRowBatch batch = schema.createRowBatch();
+    JobConf conf = new JobConf();
+    // the Parquet record writer requires a task attempt id even when it is
+    // used outside of a MapReduce job
+    conf.set("mapred.task.id", "attempt_0_0_m_0_0");
+    conf.set("parquet.compression", getCodec(args[1]));
+    // was declared but never used; compute the real value once and pass it
+    boolean isCompressed = !"none".equals(args[1]);
+    Path path = new Path(args[0]);
+    Properties properties = AvroWriter.setHiveSchema(schema);
+    MapredParquetOutputFormat format = new MapredParquetOutputFormat();
+    FileSinkOperator.RecordWriter writer = format.getHiveRecordWriter(conf,
+        path, ParquetHiveRecord.class, isCompressed, properties,
+        Reporter.NULL);
+    ParquetHiveRecord record = new ParquetHiveRecord();
+    record.inspector =
+        (StructObjectInspector) VectorToWritable.createObjectInspector(schema);
+    for(String inFile: TaxiToOrc.sliceArray(args, 2)) {
+      CsvReader reader = new CsvReader(new Path(inFile), conf, schema);
+      while (reader.nextBatch(batch)) {
+        for(int r=0; r < batch.size; ++r) {
+          // reuse record.value across rows to avoid per-row allocation
+          record.value = VectorToWritable.createValue(batch, r, schema,
+              record.value);
+          writer.write(record);
+        }
+      }
+    }
+    writer.close(true);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/main/resources/github.schema
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/github.schema 
b/java/bench/src/main/resources/github.schema
new file mode 100644
index 0000000..3b7dd15
--- /dev/null
+++ b/java/bench/src/main/resources/github.schema
@@ -0,0 +1,702 @@
+struct<
+  actor:struct <
+    avatar_url: string,
+    gravatar_id: string,
+    id: int,
+    login: string,
+    url: string>,
+  created_at:timestamp,
+  id:binary,
+  org:struct <
+    avatar_url: string,
+    gravatar_id: string,
+    id: int,
+    login: string,
+    url: string>,
+  payload:struct <
+    action: string,
+    before: binary,
+    comment: struct <
+      _links: struct <
+        html: struct <
+          href: string>,
+        pull_request: struct <
+          href: string>,
+        self: struct <
+          href: string>>,
+      body: string,
+      commit_id: binary,
+      created_at: timestamp,
+      diff_hunk: string,
+      html_url: string,
+      id: int,
+      issue_url: string,
+      line: int,
+      original_commit_id: binary,
+      original_position: int,
+      path: string,
+      position: int,
+      pull_request_url: string,
+      updated_at: timestamp,
+      url: string,
+      user: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>>,
+    commits: array <struct <
+        author: struct <
+          email: string,
+          name: string>,
+        distinct: boolean,
+        message: string,
+        sha: binary,
+        url: string>>,
+    description: string,
+    distinct_size: int,
+    forkee: struct <
+      archive_url: string,
+      assignees_url: string,
+      blobs_url: string,
+      branches_url: string,
+      clone_url: string,
+      collaborators_url: string,
+      comments_url: string,
+      commits_url: string,
+      compare_url: string,
+      contents_url: string,
+      contributors_url: string,
+      created_at: timestamp,
+      default_branch: string,
+      description: string,
+      downloads_url: string,
+      events_url: string,
+      fork: boolean,
+      forks: int,
+      forks_count: int,
+      forks_url: string,
+      full_name: string,
+      git_commits_url: string,
+      git_refs_url: string,
+      git_tags_url: string,
+      git_url: string,
+      has_downloads: boolean,
+      has_issues: boolean,
+      has_pages: boolean,
+      has_wiki: boolean,
+      homepage: string,
+      hooks_url: string,
+      html_url: string,
+      id: int,
+      issue_comment_url: string,
+      issue_events_url: string,
+      issues_url: string,
+      keys_url: string,
+      labels_url: string,
+      language: string,
+      languages_url: string,
+      merges_url: string,
+      milestones_url: string,
+      mirror_url: string,
+      name: string,
+      notifications_url: string,
+      open_issues: int,
+      open_issues_count: int,
+      owner: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>,
+      private: boolean,
+      public: boolean,
+      pulls_url: string,
+      pushed_at: timestamp,
+      releases_url: string,
+      size: int,
+      ssh_url: string,
+      stargazers_count: int,
+      stargazers_url: string,
+      statuses_url: string,
+      subscribers_url: string,
+      subscription_url: string,
+      svn_url: string,
+      tags_url: string,
+      teams_url: string,
+      trees_url: string,
+      updated_at: timestamp,
+      url: string,
+      watchers: int,
+      watchers_count: int>,
+    head: binary,
+    issue: struct <
+      assignee: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>,
+      body: string,
+      closed_at: timestamp,
+      comments: int,
+      comments_url: string,
+      created_at: timestamp,
+      events_url: string,
+      html_url: string,
+      id: int,
+      labels: array <struct <
+          color: binary,
+          name: string,
+          url: string>>,
+      labels_url: string,
+      locked: boolean,
+      milestone: struct <
+        closed_at: timestamp,
+        closed_issues: int,
+        created_at: timestamp,
+        creator: struct <
+          avatar_url: string,
+          events_url: string,
+          followers_url: string,
+          following_url: string,
+          gists_url: string,
+          gravatar_id: string,
+          html_url: string,
+          id: int,
+          login: string,
+          organizations_url: string,
+          received_events_url: string,
+          repos_url: string,
+          site_admin: boolean,
+          starred_url: string,
+          subscriptions_url: string,
+          type: string,
+          url: string>,
+        description: string,
+        due_on: timestamp,
+        html_url: string,
+        id: int,
+        labels_url: string,
+        number: int,
+        open_issues: int,
+        state: string,
+        title: string,
+        updated_at: timestamp,
+        url: string>,
+      number: int,
+      pull_request: struct <
+        diff_url: string,
+        html_url: string,
+        patch_url: string,
+        url: string>,
+      state: string,
+      title: string,
+      updated_at: timestamp,
+      url: string,
+      user: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>>,
+    master_branch: string,
+    member: struct <
+      avatar_url: string,
+      events_url: string,
+      followers_url: string,
+      following_url: string,
+      gists_url: string,
+      gravatar_id: string,
+      html_url: string,
+      id: int,
+      login: string,
+      organizations_url: string,
+      received_events_url: string,
+      repos_url: string,
+      site_admin: boolean,
+      starred_url: string,
+      subscriptions_url: string,
+      type: string,
+      url: string>,
+    number: int,
+    pages: array <struct <
+        action: string,
+        html_url: string,
+        page_name: string,
+        sha: binary,
+        summary: string,
+        title: string>>,
+    pull_request: struct <
+      _links: struct <
+        comments: struct <
+          href: string>,
+        commits: struct <
+          href: string>,
+        html: struct <
+          href: string>,
+        issue: struct <
+          href: string>,
+        review_comment: struct <
+          href: string>,
+        review_comments: struct <
+          href: string>,
+        self: struct <
+          href: string>,
+        statuses: struct <
+          href: string>>,
+      additions: int,
+      assignee: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>,
+      base: struct <
+        label: string,
+        ref: string,
+        repo: struct <
+          archive_url: string,
+          assignees_url: string,
+          blobs_url: string,
+          branches_url: string,
+          clone_url: string,
+          collaborators_url: string,
+          comments_url: string,
+          commits_url: string,
+          compare_url: string,
+          contents_url: string,
+          contributors_url: string,
+          created_at: timestamp,
+          default_branch: string,
+          description: string,
+          downloads_url: string,
+          events_url: string,
+          fork: boolean,
+          forks: int,
+          forks_count: int,
+          forks_url: string,
+          full_name: string,
+          git_commits_url: string,
+          git_refs_url: string,
+          git_tags_url: string,
+          git_url: string,
+          has_downloads: boolean,
+          has_issues: boolean,
+          has_pages: boolean,
+          has_wiki: boolean,
+          homepage: string,
+          hooks_url: string,
+          html_url: string,
+          id: int,
+          issue_comment_url: string,
+          issue_events_url: string,
+          issues_url: string,
+          keys_url: string,
+          labels_url: string,
+          language: string,
+          languages_url: string,
+          merges_url: string,
+          milestones_url: string,
+          mirror_url: string,
+          name: string,
+          notifications_url: string,
+          open_issues: int,
+          open_issues_count: int,
+          owner: struct <
+            avatar_url: string,
+            events_url: string,
+            followers_url: string,
+            following_url: string,
+            gists_url: string,
+            gravatar_id: string,
+            html_url: string,
+            id: int,
+            login: string,
+            organizations_url: string,
+            received_events_url: string,
+            repos_url: string,
+            site_admin: boolean,
+            starred_url: string,
+            subscriptions_url: string,
+            type: string,
+            url: string>,
+          private: boolean,
+          pulls_url: string,
+          pushed_at: timestamp,
+          releases_url: string,
+          size: int,
+          ssh_url: string,
+          stargazers_count: int,
+          stargazers_url: string,
+          statuses_url: string,
+          subscribers_url: string,
+          subscription_url: string,
+          svn_url: string,
+          tags_url: string,
+          teams_url: string,
+          trees_url: string,
+          updated_at: timestamp,
+          url: string,
+          watchers: int,
+          watchers_count: int>,
+        sha: binary,
+        user: struct <
+          avatar_url: string,
+          events_url: string,
+          followers_url: string,
+          following_url: string,
+          gists_url: string,
+          gravatar_id: string,
+          html_url: string,
+          id: int,
+          login: string,
+          organizations_url: string,
+          received_events_url: string,
+          repos_url: string,
+          site_admin: boolean,
+          starred_url: string,
+          subscriptions_url: string,
+          type: string,
+          url: string>>,
+      body: string,
+      changed_files: int,
+      closed_at: timestamp,
+      comments: int,
+      comments_url: string,
+      commits: int,
+      commits_url: string,
+      created_at: timestamp,
+      deletions: int,
+      diff_url: string,
+      head: struct <
+        label: string,
+        ref: string,
+        repo: struct <
+          archive_url: string,
+          assignees_url: string,
+          blobs_url: string,
+          branches_url: string,
+          clone_url: string,
+          collaborators_url: string,
+          comments_url: string,
+          commits_url: string,
+          compare_url: string,
+          contents_url: string,
+          contributors_url: string,
+          created_at: timestamp,
+          default_branch: string,
+          description: string,
+          downloads_url: string,
+          events_url: string,
+          fork: boolean,
+          forks: int,
+          forks_count: int,
+          forks_url: string,
+          full_name: string,
+          git_commits_url: string,
+          git_refs_url: string,
+          git_tags_url: string,
+          git_url: string,
+          has_downloads: boolean,
+          has_issues: boolean,
+          has_pages: boolean,
+          has_wiki: boolean,
+          homepage: string,
+          hooks_url: string,
+          html_url: string,
+          id: int,
+          issue_comment_url: string,
+          issue_events_url: string,
+          issues_url: string,
+          keys_url: string,
+          labels_url: string,
+          language: string,
+          languages_url: string,
+          merges_url: string,
+          milestones_url: string,
+          mirror_url: string,
+          name: string,
+          notifications_url: string,
+          open_issues: int,
+          open_issues_count: int,
+          owner: struct <
+            avatar_url: string,
+            events_url: string,
+            followers_url: string,
+            following_url: string,
+            gists_url: string,
+            gravatar_id: string,
+            html_url: string,
+            id: int,
+            login: string,
+            organizations_url: string,
+            received_events_url: string,
+            repos_url: string,
+            site_admin: boolean,
+            starred_url: string,
+            subscriptions_url: string,
+            type: string,
+            url: string>,
+          private: boolean,
+          pulls_url: string,
+          pushed_at: timestamp,
+          releases_url: string,
+          size: int,
+          ssh_url: string,
+          stargazers_count: int,
+          stargazers_url: string,
+          statuses_url: string,
+          subscribers_url: string,
+          subscription_url: string,
+          svn_url: string,
+          tags_url: string,
+          teams_url: string,
+          trees_url: string,
+          updated_at: timestamp,
+          url: string,
+          watchers: int,
+          watchers_count: int>,
+        sha: binary,
+        user: struct <
+          avatar_url: string,
+          events_url: string,
+          followers_url: string,
+          following_url: string,
+          gists_url: string,
+          gravatar_id: string,
+          html_url: string,
+          id: int,
+          login: string,
+          organizations_url: string,
+          received_events_url: string,
+          repos_url: string,
+          site_admin: boolean,
+          starred_url: string,
+          subscriptions_url: string,
+          type: string,
+          url: string>>,
+      html_url: string,
+      id: int,
+      issue_url: string,
+      locked: boolean,
+      merge_commit_sha: string,
+      mergeable: boolean,
+      mergeable_state: string,
+      merged: boolean,
+      merged_at: timestamp,
+      merged_by: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>,
+      milestone: struct <
+        closed_at: timestamp,
+        closed_issues: int,
+        created_at: timestamp,
+        creator: struct <
+          avatar_url: string,
+          events_url: string,
+          followers_url: string,
+          following_url: string,
+          gists_url: string,
+          gravatar_id: string,
+          html_url: string,
+          id: int,
+          login: string,
+          organizations_url: string,
+          received_events_url: string,
+          repos_url: string,
+          site_admin: boolean,
+          starred_url: string,
+          subscriptions_url: string,
+          type: string,
+          url: string>,
+        description: string,
+        due_on: timestamp,
+        html_url: string,
+        id: int,
+        labels_url: string,
+        number: int,
+        open_issues: int,
+        state: string,
+        title: string,
+        updated_at: timestamp,
+        url: string>,
+      number: int,
+      patch_url: string,
+      review_comment_url: string,
+      review_comments: int,
+      review_comments_url: string,
+      state: string,
+      statuses_url: string,
+      title: string,
+      updated_at: timestamp,
+      url: string,
+      user: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>>,
+    push_id: int,
+    pusher_type: string,
+    ref: string,
+    ref_type: string,
+    release: struct <
+      assets: array <struct <
+          browser_download_url: string,
+          content_type: string,
+          created_at: timestamp,
+          download_count: int,
+          id: int,
+          label: string,
+          name: string,
+          size: int,
+          state: string,
+          updated_at: timestamp,
+          uploader: struct <
+            avatar_url: string,
+            events_url: string,
+            followers_url: string,
+            following_url: string,
+            gists_url: string,
+            gravatar_id: string,
+            html_url: string,
+            id: int,
+            login: string,
+            organizations_url: string,
+            received_events_url: string,
+            repos_url: string,
+            site_admin: boolean,
+            starred_url: string,
+            subscriptions_url: string,
+            type: string,
+            url: string>,
+          url: string>>,
+      assets_url: string,
+      author: struct <
+        avatar_url: string,
+        events_url: string,
+        followers_url: string,
+        following_url: string,
+        gists_url: string,
+        gravatar_id: string,
+        html_url: string,
+        id: int,
+        login: string,
+        organizations_url: string,
+        received_events_url: string,
+        repos_url: string,
+        site_admin: boolean,
+        starred_url: string,
+        subscriptions_url: string,
+        type: string,
+        url: string>,
+      body: string,
+      created_at: timestamp,
+      draft: boolean,
+      html_url: string,
+      id: int,
+      name: string,
+      prerelease: boolean,
+      published_at: timestamp,
+      tag_name: string,
+      tarball_url: string,
+      target_commitish: string,
+      upload_url: string,
+      url: string,
+      zipball_url: string>,
+    size: int>,
+  public: boolean,
+  repo: struct <
+    id: int,
+    name: string,
+    url: string>,
+  type: string
+>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/log4j.properties 
b/java/bench/src/main/resources/log4j.properties
new file mode 100644
index 0000000..363917c
--- /dev/null
+++ b/java/bench/src/main/resources/log4j.properties
@@ -0,0 +1,6 @@
+log4j.rootLogger=WARN, CONSOLE
+
+# CONSOLE is set to be a ConsoleAppender using a PatternLayout
+log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
+log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
+log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %m%n
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/bench/src/main/resources/nyc-taxi.schema
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/nyc-taxi.schema 
b/java/bench/src/main/resources/nyc-taxi.schema
new file mode 100644
index 0000000..5eb7c0f
--- /dev/null
+++ b/java/bench/src/main/resources/nyc-taxi.schema
@@ -0,0 +1,21 @@
+struct<
+  vendor_id: int,
+  pickup_time: timestamp,
+  dropoff_time: timestamp,
+  passenger_count: int,
+  trip_distance: double,
+  pickup_longitude: double,
+  pickup_latitude: double,
+  ratecode_id: int,
+  store_and_fwd_flag: string,
+  dropoff_longitude: double,
+  dropoff_latitude: double,
+  payment_type: int,
+  fare_amount: decimal(8,2),
+  extra: decimal(8,2),
+  mta_tax: decimal(8,2),
+  tip_amount: decimal(8,2),
+  tolls_amount: decimal(8,2),
+  improvement_surcharge: decimal(8,2),
+  total_amount: decimal(8,2)
+>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/core/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java 
b/java/core/src/java/org/apache/orc/TypeDescription.java
index bc6787d..24d15dc 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -707,7 +707,7 @@ public class TypeDescription
     return startId;
   }
 
-  private TypeDescription(Category category) {
+  public TypeDescription(Category category) {
     this.category = category;
     if (category.isPrimitive) {
       children = null;

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/pom.xml
----------------------------------------------------------------------
diff --git a/java/pom.xml b/java/pom.xml
index 794300d..5236409 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -61,6 +61,7 @@
     <module>core</module>
     <module>mapreduce</module>
     <module>tools</module>
+    <module>bench</module>
   </modules>
 
   <properties>

http://git-wip-us.apache.org/repos/asf/orc/blob/825a9441/java/tools/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java 
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 7206503..907d712 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -754,5 +754,4 @@ public final class FileDump {
         .create());
     return result;
   }
-
 }

Reply via email to