Repository: orc
Updated Branches:
  refs/heads/master 7118e968b -> 604dcc801


http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java 
b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
deleted file mode 100644
index 88c3514..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-/**
- * Murmur3 is successor to Murmur2 fast non-cryptographic hash algorithms.
- *
- * Murmur3 32 and 128 bit variants.
- * 32-bit Java port of 
https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of 
https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
- *
- * This is a public domain code with no copyrights.
- * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
- * "All MurmurHash versions are public domain software, and the author 
disclaims all copyright
- * to their code."
- */
-public class Murmur3 {
-  // from 64-bit linear congruential generator
-  public static final long NULL_HASHCODE = 2862933555777941757L;
-
-  // Constants for 32 bit variant
-  private static final int C1_32 = 0xcc9e2d51;
-  private static final int C2_32 = 0x1b873593;
-  private static final int R1_32 = 15;
-  private static final int R2_32 = 13;
-  private static final int M_32 = 5;
-  private static final int N_32 = 0xe6546b64;
-
-  // Constants for 128 bit variant
-  private static final long C1 = 0x87c37b91114253d5L;
-  private static final long C2 = 0x4cf5ad432745937fL;
-  private static final int R1 = 31;
-  private static final int R2 = 27;
-  private static final int R3 = 33;
-  private static final int M = 5;
-  private static final int N1 = 0x52dce729;
-  private static final int N2 = 0x38495ab5;
-
-  private static final int DEFAULT_SEED = 104729;
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data) {
-    return hash32(data, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default 0)
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data, int length, int seed) {
-    int hash = seed;
-    final int nblocks = length >> 2;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      int i_4 = i << 2;
-      int k = (data[i_4] & 0xff)
-          | ((data[i_4 + 1] & 0xff) << 8)
-          | ((data[i_4 + 2] & 0xff) << 16)
-          | ((data[i_4 + 3] & 0xff) << 24);
-
-      // mix functions
-      k *= C1_32;
-      k = Integer.rotateLeft(k, R1_32);
-      k *= C2_32;
-      hash ^= k;
-      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
-    }
-
-    // tail
-    int idx = nblocks << 2;
-    int k1 = 0;
-    switch (length - idx) {
-      case 3:
-        k1 ^= data[idx + 2] << 16;
-      case 2:
-        k1 ^= data[idx + 1] << 8;
-      case 1:
-        k1 ^= data[idx];
-
-        // mix functions
-        k1 *= C1_32;
-        k1 = Integer.rotateLeft(k1, R1_32);
-        k1 *= C2_32;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash ^= (hash >>> 16);
-    hash *= 0x85ebca6b;
-    hash ^= (hash >>> 13);
-    hash *= 0xc2b2ae35;
-    hash ^= (hash >>> 16);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data) {
-    return hash64(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  public static long hash64(byte[] data, int offset, int length) {
-    return hash64(data, offset, length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data, int offset, int length, int seed) {
-    long hash = seed;
-    final int nblocks = length >> 3;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i8 = i << 3;
-      long k = ((long) data[offset + i8] & 0xff)
-          | (((long) data[offset + i8 + 1] & 0xff) << 8)
-          | (((long) data[offset + i8 + 2] & 0xff) << 16)
-          | (((long) data[offset + i8 + 3] & 0xff) << 24)
-          | (((long) data[offset + i8 + 4] & 0xff) << 32)
-          | (((long) data[offset + i8 + 5] & 0xff) << 40)
-          | (((long) data[offset + i8 + 6] & 0xff) << 48)
-          | (((long) data[offset + i8 + 7] & 0xff) << 56);
-
-      // mix functions
-      k *= C1;
-      k = Long.rotateLeft(k, R1);
-      k *= C2;
-      hash ^= k;
-      hash = Long.rotateLeft(hash, R2) * M + N1;
-    }
-
-    // tail
-    long k1 = 0;
-    int tailStart = nblocks << 3;
-    switch (length - tailStart) {
-      case 7:
-        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= ((long) data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash = fmix64(hash);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data) {
-    return hash128(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param offset - the first element of array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data, int offset, int length, int seed) {
-    long h1 = seed;
-    long h2 = seed;
-    final int nblocks = length >> 4;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i16 = i << 4;
-      long k1 = ((long) data[offset + i16] & 0xff)
-          | (((long) data[offset + i16 + 1] & 0xff) << 8)
-          | (((long) data[offset + i16 + 2] & 0xff) << 16)
-          | (((long) data[offset + i16 + 3] & 0xff) << 24)
-          | (((long) data[offset + i16 + 4] & 0xff) << 32)
-          | (((long) data[offset + i16 + 5] & 0xff) << 40)
-          | (((long) data[offset + i16 + 6] & 0xff) << 48)
-          | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
-      long k2 = ((long) data[offset + i16 + 8] & 0xff)
-          | (((long) data[offset + i16 + 9] & 0xff) << 8)
-          | (((long) data[offset + i16 + 10] & 0xff) << 16)
-          | (((long) data[offset + i16 + 11] & 0xff) << 24)
-          | (((long) data[offset + i16 + 12] & 0xff) << 32)
-          | (((long) data[offset + i16 + 13] & 0xff) << 40)
-          | (((long) data[offset + i16 + 14] & 0xff) << 48)
-          | (((long) data[offset + i16 + 15] & 0xff) << 56);
-
-      // mix functions for k1
-      k1 *= C1;
-      k1 = Long.rotateLeft(k1, R1);
-      k1 *= C2;
-      h1 ^= k1;
-      h1 = Long.rotateLeft(h1, R2);
-      h1 += h2;
-      h1 = h1 * M + N1;
-
-      // mix functions for k2
-      k2 *= C2;
-      k2 = Long.rotateLeft(k2, R3);
-      k2 *= C1;
-      h2 ^= k2;
-      h2 = Long.rotateLeft(h2, R1);
-      h2 += h1;
-      h2 = h2 * M + N2;
-    }
-
-    // tail
-    long k1 = 0;
-    long k2 = 0;
-    int tailStart = nblocks << 4;
-    switch (length - tailStart) {
-      case 15:
-        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
-      case 14:
-        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
-      case 13:
-        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
-      case 12:
-        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
-      case 11:
-        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
-      case 10:
-        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
-      case 9:
-        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
-        k2 *= C2;
-        k2 = Long.rotateLeft(k2, R3);
-        k2 *= C1;
-        h2 ^= k2;
-
-      case 8:
-        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
-      case 7:
-        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= (long) (data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        h1 ^= k1;
-    }
-
-    // finalization
-    h1 ^= length;
-    h2 ^= length;
-
-    h1 += h2;
-    h2 += h1;
-
-    h1 = fmix64(h1);
-    h2 = fmix64(h2);
-
-    h1 += h2;
-    h2 += h1;
-
-    return new long[]{h1, h2};
-  }
-
-  private static long fmix64(long h) {
-    h ^= (h >>> 33);
-    h *= 0xff51afd7ed558ccdL;
-    h ^= (h >>> 33);
-    h *= 0xc4ceb9fe1a85ec53L;
-    h ^= (h >>> 33);
-    return h;
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/orc/util/Murmur3.java 
b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
new file mode 100644
index 0000000..838681c
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+/**
+ * Murmur3 is successor to Murmur2 fast non-cryptographic hash algorithms.
+ *
+ * Murmur3 32 and 128 bit variants.
+ * 32-bit Java port of 
https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
+ * 128-bit Java port of 
https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
+ *
+ * This is a public domain code with no copyrights.
+ * From homepage of MurmurHash (https://code.google.com/p/smhasher/),
+ * "All MurmurHash versions are public domain software, and the author 
disclaims all copyright
+ * to their code."
+ */
+public class Murmur3 {
+  // from 64-bit linear congruential generator
+  public static final long NULL_HASHCODE = 2862933555777941757L;
+
+  // Constants for 32 bit variant
+  private static final int C1_32 = 0xcc9e2d51;
+  private static final int C2_32 = 0x1b873593;
+  private static final int R1_32 = 15;
+  private static final int R2_32 = 13;
+  private static final int M_32 = 5;
+  private static final int N_32 = 0xe6546b64;
+
+  // Constants for 128 bit variant
+  private static final long C1 = 0x87c37b91114253d5L;
+  private static final long C2 = 0x4cf5ad432745937fL;
+  private static final int R1 = 31;
+  private static final int R2 = 27;
+  private static final int R3 = 33;
+  private static final int M = 5;
+  private static final int N1 = 0x52dce729;
+  private static final int N2 = 0x38495ab5;
+
+  private static final int DEFAULT_SEED = 104729;
+
+  /**
+   * Murmur3 32-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode
+   */
+  public static int hash32(byte[] data) {
+    return hash32(data, data.length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 32-bit variant.
+   *
+   * @param data   - input byte array
+   * @param length - length of array
+   * @param seed   - seed. (default 0)
+   * @return - hashcode
+   */
+  public static int hash32(byte[] data, int length, int seed) {
+    int hash = seed;
+    final int nblocks = length >> 2;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      int i_4 = i << 2;
+      int k = (data[i_4] & 0xff)
+          | ((data[i_4 + 1] & 0xff) << 8)
+          | ((data[i_4 + 2] & 0xff) << 16)
+          | ((data[i_4 + 3] & 0xff) << 24);
+
+      // mix functions
+      k *= C1_32;
+      k = Integer.rotateLeft(k, R1_32);
+      k *= C2_32;
+      hash ^= k;
+      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+    }
+
+    // tail
+    int idx = nblocks << 2;
+    int k1 = 0;
+    switch (length - idx) {
+      case 3:
+        k1 ^= data[idx + 2] << 16;
+      case 2:
+        k1 ^= data[idx + 1] << 8;
+      case 1:
+        k1 ^= data[idx];
+
+        // mix functions
+        k1 *= C1_32;
+        k1 = Integer.rotateLeft(k1, R1_32);
+        k1 *= C2_32;
+        hash ^= k1;
+    }
+
+    // finalization
+    hash ^= length;
+    hash ^= (hash >>> 16);
+    hash *= 0x85ebca6b;
+    hash ^= (hash >>> 13);
+    hash *= 0xc2b2ae35;
+    hash ^= (hash >>> 16);
+
+    return hash;
+  }
+
+  /**
+   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode
+   */
+  public static long hash64(byte[] data) {
+    return hash64(data, 0, data.length, DEFAULT_SEED);
+  }
+
+  public static long hash64(byte[] data, int offset, int length) {
+    return hash64(data, offset, length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit variant.
+   *
+   * @param data   - input byte array
+   * @param length - length of array
+   * @param seed   - seed. (default is 0)
+   * @return - hashcode
+   */
+  public static long hash64(byte[] data, int offset, int length, int seed) {
+    long hash = seed;
+    final int nblocks = length >> 3;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      final int i8 = i << 3;
+      long k = ((long) data[offset + i8] & 0xff)
+          | (((long) data[offset + i8 + 1] & 0xff) << 8)
+          | (((long) data[offset + i8 + 2] & 0xff) << 16)
+          | (((long) data[offset + i8 + 3] & 0xff) << 24)
+          | (((long) data[offset + i8 + 4] & 0xff) << 32)
+          | (((long) data[offset + i8 + 5] & 0xff) << 40)
+          | (((long) data[offset + i8 + 6] & 0xff) << 48)
+          | (((long) data[offset + i8 + 7] & 0xff) << 56);
+
+      // mix functions
+      k *= C1;
+      k = Long.rotateLeft(k, R1);
+      k *= C2;
+      hash ^= k;
+      hash = Long.rotateLeft(hash, R2) * M + N1;
+    }
+
+    // tail
+    long k1 = 0;
+    int tailStart = nblocks << 3;
+    switch (length - tailStart) {
+      case 7:
+        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
+      case 6:
+        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
+      case 5:
+        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
+      case 4:
+        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
+      case 3:
+        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
+      case 2:
+        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
+      case 1:
+        k1 ^= ((long) data[offset + tailStart] & 0xff);
+        k1 *= C1;
+        k1 = Long.rotateLeft(k1, R1);
+        k1 *= C2;
+        hash ^= k1;
+    }
+
+    // finalization
+    hash ^= length;
+    hash = fmix64(hash);
+
+    return hash;
+  }
+
+  /**
+   * Murmur3 128-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode (2 longs)
+   */
+  public static long[] hash128(byte[] data) {
+    return hash128(data, 0, data.length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 128-bit variant.
+   *
+   * @param data   - input byte array
+   * @param offset - the first element of array
+   * @param length - length of array
+   * @param seed   - seed. (default is 0)
+   * @return - hashcode (2 longs)
+   */
+  public static long[] hash128(byte[] data, int offset, int length, int seed) {
+    long h1 = seed;
+    long h2 = seed;
+    final int nblocks = length >> 4;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      final int i16 = i << 4;
+      long k1 = ((long) data[offset + i16] & 0xff)
+          | (((long) data[offset + i16 + 1] & 0xff) << 8)
+          | (((long) data[offset + i16 + 2] & 0xff) << 16)
+          | (((long) data[offset + i16 + 3] & 0xff) << 24)
+          | (((long) data[offset + i16 + 4] & 0xff) << 32)
+          | (((long) data[offset + i16 + 5] & 0xff) << 40)
+          | (((long) data[offset + i16 + 6] & 0xff) << 48)
+          | (((long) data[offset + i16 + 7] & 0xff) << 56);
+
+      long k2 = ((long) data[offset + i16 + 8] & 0xff)
+          | (((long) data[offset + i16 + 9] & 0xff) << 8)
+          | (((long) data[offset + i16 + 10] & 0xff) << 16)
+          | (((long) data[offset + i16 + 11] & 0xff) << 24)
+          | (((long) data[offset + i16 + 12] & 0xff) << 32)
+          | (((long) data[offset + i16 + 13] & 0xff) << 40)
+          | (((long) data[offset + i16 + 14] & 0xff) << 48)
+          | (((long) data[offset + i16 + 15] & 0xff) << 56);
+
+      // mix functions for k1
+      k1 *= C1;
+      k1 = Long.rotateLeft(k1, R1);
+      k1 *= C2;
+      h1 ^= k1;
+      h1 = Long.rotateLeft(h1, R2);
+      h1 += h2;
+      h1 = h1 * M + N1;
+
+      // mix functions for k2
+      k2 *= C2;
+      k2 = Long.rotateLeft(k2, R3);
+      k2 *= C1;
+      h2 ^= k2;
+      h2 = Long.rotateLeft(h2, R1);
+      h2 += h1;
+      h2 = h2 * M + N2;
+    }
+
+    // tail
+    long k1 = 0;
+    long k2 = 0;
+    int tailStart = nblocks << 4;
+    switch (length - tailStart) {
+      case 15:
+        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
+      case 14:
+        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
+      case 13:
+        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
+      case 12:
+        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
+      case 11:
+        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
+      case 10:
+        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
+      case 9:
+        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
+        k2 *= C2;
+        k2 = Long.rotateLeft(k2, R3);
+        k2 *= C1;
+        h2 ^= k2;
+
+      case 8:
+        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
+      case 7:
+        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
+      case 6:
+        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
+      case 5:
+        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
+      case 4:
+        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
+      case 3:
+        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
+      case 2:
+        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
+      case 1:
+        k1 ^= (long) (data[offset + tailStart] & 0xff);
+        k1 *= C1;
+        k1 = Long.rotateLeft(k1, R1);
+        k1 *= C2;
+        h1 ^= k1;
+    }
+
+    // finalization
+    h1 ^= length;
+    h2 ^= length;
+
+    h1 += h2;
+    h2 += h1;
+
+    h1 = fmix64(h1);
+    h2 = fmix64(h2);
+
+    h1 += h2;
+    h2 += h1;
+
+    return new long[]{h1, h2};
+  }
+
+  private static long fmix64(long h) {
+    h ^= (h >>> 33);
+    h *= 0xff51afd7ed558ccdL;
+    h ^= (h >>> 33);
+    h *= 0xc4ceb9fe1a85ec53L;
+    h ^= (h >>> 33);
+    return h;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git 
a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java 
b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
deleted file mode 100644
index 5facc7c..0000000
--- a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
-  @Test
-  public void testHashCodesM3_32_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_32(seed);
-    int hc1 = hf.hashBytes(key.getBytes()).asInt();
-    int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-
-    key = "testkey";
-    hc1 = hf.hashBytes(key.getBytes()).asInt();
-    hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-  }
-
-  @Test
-  public void testHashCodesM3_32_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_128(seed);
-    // guava stores the hashcodes in little endian order
-    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    long gl1 = buf.getLong();
-    long gl2 = buf.getLong(8);
-    long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, 
seed);
-    long m1 = hc[0];
-    long m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    key = "testkey128_testkey128";
-    buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    gl1 = buf.getLong();
-    gl2 = buf.getLong(8);
-    byte[] keyBytes = key.getBytes();
-    hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
-    m1 = hc[0];
-    m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
-    Arrays.fill(offsetKeyBytes, (byte) -1);
-    System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
-    hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
-    assertEquals(gl1, hc[0]);
-    assertEquals(gl2, hc[1]);
-  }
-
-  @Test
-  public void testHashCodeM3_64() {
-    byte[] origin = ("It was the best of times, it was the worst of times," +
-        " it was the age of wisdom, it was the age of foolishness," +
-        " it was the epoch of belief, it was the epoch of incredulity," +
-        " it was the season of Light, it was the season of Darkness," +
-        " it was the spring of hope, it was the winter of despair," +
-        " we had everything before us, we had nothing before us," +
-        " we were all going direct to Heaven," +
-        " we were all going direct the other way.").getBytes();
-    long hash = Murmur3.hash64(origin, 0, origin.length);
-    assertEquals(305830725663368540L, hash);
-
-    byte[] originOffset = new byte[origin.length + 150];
-    Arrays.fill(originOffset, (byte) 123);
-    System.arraycopy(origin, 0, originOffset, 150, origin.length);
-    hash = Murmur3.hash64(originOffset, 150, origin.length);
-    assertEquals(305830725663368540L, hash);
-  }
-
-  @Test
-  public void testHashCodesM3_128_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-
-      byte[] offsetData = new byte[data.length + 50];
-      System.arraycopy(data, 0, offsetData, 50, data.length);
-      hc = Murmur3.hash128(offsetData, 50, data.length, seed);
-      assertEquals(gl1, hc[0]);
-      assertEquals(gl2, hc[1]);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java 
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 876070b..7206503 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -37,7 +37,8 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.OrcFile;
@@ -383,7 +384,9 @@ public final class FileDump {
           StringBuilder buf = new StringBuilder();
           String rowIdxString = getFormattedRowIndices(col, 
indices.getRowGroupIndex());
           buf.append(rowIdxString);
-          String bloomFilString = getFormattedBloomFilters(col, 
indices.getBloomFilterIndex());
+          String bloomFilString = getFormattedBloomFilters(col, indices,
+              reader.getWriterVersion(),
+              reader.getSchema().findSubtype(col).getCategory());
           buf.append(bloomFilString);
           System.out.println(buf);
         }
@@ -604,15 +607,18 @@ public final class FileDump {
     return -1;
   }
 
-  private static String getFormattedBloomFilters(int col,
-      OrcProto.BloomFilterIndex[] bloomFilterIndex) {
+  private static String getFormattedBloomFilters(int col, OrcIndex index,
+                                                 OrcFile.WriterVersion version,
+                                                 TypeDescription.Category 
type) {
+    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
     StringBuilder buf = new StringBuilder();
-    BloomFilterIO stripeLevelBF = null;
+    BloomFilter stripeLevelBF = null;
     if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
       int idx = 0;
       buf.append("\n    Bloom filters for column ").append(col).append(":");
       for (OrcProto.BloomFilter bf : 
bloomFilterIndex[col].getBloomFilterList()) {
-        BloomFilterIO toMerge = new BloomFilterIO(bf);
+        BloomFilter toMerge = BloomFilterIO.deserialize(
+            index.getBloomFilterKinds()[col], version, type, bf);
         buf.append("\n      Entry 
").append(idx++).append(":").append(getBloomFilterStats(toMerge));
         if (stripeLevelBF == null) {
           stripeLevelBF = toMerge;
@@ -626,7 +632,7 @@ public final class FileDump {
     return buf.toString();
   }
 
-  private static String getBloomFilterStats(BloomFilterIO bf) {
+  private static String getBloomFilterStats(BloomFilter bf) {
     StringBuilder sb = new StringBuilder();
     int bitCount = bf.getBitSize();
     int popCount = 0;

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java 
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index e2048ea..aa3072c 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -20,18 +20,20 @@ package org.apache.orc.tools;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
 import org.apache.orc.impl.AcidStats;
 import org.apache.orc.impl.OrcAcidUtils;
 import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
 import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.BinaryColumnStatistics;
 import org.apache.orc.BooleanColumnStatistics;
 import org.apache.orc.ColumnStatistics;
@@ -50,12 +52,16 @@ import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
 import org.codehaus.jettison.json.JSONStringer;
 import org.codehaus.jettison.json.JSONWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * File dump tool with json formatted output.
  */
 public class JsonFileDump {
 
+  private static final Logger LOG = 
LoggerFactory.getLogger(JsonFileDump.class);
+
   public static void printJsonMetaData(List<String> files,
       Configuration conf,
       List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
@@ -185,7 +191,9 @@ public class JsonFileDump {
               writer.object();
               writer.key("columnId").value(col);
               writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
-              writeBloomFilterIndexes(writer, col, 
indices.getBloomFilterIndex());
+              writeBloomFilterIndexes(writer, col, indices,
+                  reader.getWriterVersion(),
+                  reader.getSchema().findSubtype(col).getCategory());
               writer.endObject();
             }
             writer.endArray();
@@ -334,16 +342,21 @@ public class JsonFileDump {
   }
 
   private static void writeBloomFilterIndexes(JSONWriter writer, int col,
-      OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
+                                              OrcIndex index,
+                                              OrcFile.WriterVersion version,
+                                              TypeDescription.Category type
+                                              ) throws JSONException {
 
-    BloomFilterIO stripeLevelBF = null;
+    BloomFilter stripeLevelBF = null;
+    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
     if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
       int entryIx = 0;
       writer.key("bloomFilterIndexes").array();
       for (OrcProto.BloomFilter bf : 
bloomFilterIndex[col].getBloomFilterList()) {
         writer.object();
         writer.key("entryId").value(entryIx++);
-        BloomFilterIO toMerge = new BloomFilterIO(bf);
+        BloomFilter toMerge = BloomFilterIO.deserialize(
+            index.getBloomFilterKinds()[col], version, type, bf);
         writeBloomFilterStats(writer, toMerge);
         if (stripeLevelBF == null) {
           stripeLevelBF = toMerge;
@@ -362,7 +375,7 @@ public class JsonFileDump {
     }
   }
 
-  private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO 
bf)
+  private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf)
       throws JSONException {
     int bitCount = bf.getBitSize();
     int popCount = 0;

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java 
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 10cc87d..65ff404 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -445,8 +445,9 @@ public class TestFileDump {
         .compress(CompressionKind.ZLIB)
         .bufferSize(10000)
         .rowIndexStride(1000)
-        .bloomFilterColumns("l")
-        .bloomFilterFpp(0.01);
+        .bloomFilterColumns("l,s")
+        .bloomFilterFpp(0.01)
+        .bloomFilterVersion(OrcFile.BloomFilterVersion.ORIGINAL);
     VectorizedRowBatch batch = schema.createRowBatch(1000);
     Writer writer = OrcFile.createWriter(testFilePath, options);
     Random r1 = new Random(1);
@@ -483,7 +484,6 @@ public class TestFileDump {
     System.out.flush();
     System.setOut(origOut);
 
-
     checkOutput(outputFilename, workDir + File.separator + outputFilename);
   }
 

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/log4j.properties 
b/java/tools/src/test/resources/log4j.properties
new file mode 100644
index 0000000..8224baf
--- /dev/null
+++ b/java/tools/src/test/resources/log4j.properties
@@ -0,0 +1,21 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=WARN,stdout
+
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target   = System.err
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out 
b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index 18fd2fb..b879bed 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096
@@ -39,17 +39,17 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 3 section ROW_INDEX start: 355 length 87
-    Stream: column 3 section BLOOM_FILTER start: 442 length 512
-    Stream: column 1 section DATA start: 954 length 20035
-    Stream: column 2 section DATA start: 20989 length 40050
-    Stream: column 3 section DATA start: 61039 length 3543
-    Stream: column 3 section LENGTH start: 64582 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304
+    Stream: column 1 section DATA start: 746 length 20035
+    Stream: column 2 section DATA start: 20781 length 40050
+    Stream: column 3 section DATA start: 60831 length 3543
+    Stream: column 3 section LENGTH start: 64374 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 64399 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
-    Stream: column 0 section ROW_INDEX start: 64826 length 17
-    Stream: column 1 section ROW_INDEX start: 64843 length 164
-    Stream: column 2 section ROW_INDEX start: 65007 length 168
-    Stream: column 3 section ROW_INDEX start: 65175 length 83
-    Stream: column 3 section BLOOM_FILTER start: 65258 length 512
-    Stream: column 1 section DATA start: 65770 length 20035
-    Stream: column 2 section DATA start: 85805 length 40050
-    Stream: column 3 section DATA start: 125855 length 3532
-    Stream: column 3 section LENGTH start: 129387 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
+  Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736
+    Stream: column 0 section ROW_INDEX start: 64618 length 17
+    Stream: column 1 section ROW_INDEX start: 64635 length 164
+    Stream: column 2 section ROW_INDEX start: 64799 length 168
+    Stream: column 3 section ROW_INDEX start: 64967 length 83
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304
+    Stream: column 1 section DATA start: 65354 length 20035
+    Stream: column 2 section DATA start: 85389 length 40050
+    Stream: column 3 section DATA start: 125439 length 3532
+    Stream: column 3 section LENGTH start: 128971 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 128996 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
-    Stream: column 0 section ROW_INDEX start: 129631 length 17
-    Stream: column 1 section ROW_INDEX start: 129648 length 163
-    Stream: column 2 section ROW_INDEX start: 129811 length 168
-    Stream: column 3 section ROW_INDEX start: 129979 length 90
-    Stream: column 3 section BLOOM_FILTER start: 130069 length 512
-    Stream: column 1 section DATA start: 130581 length 20035
-    Stream: column 2 section DATA start: 150616 length 40050
-    Stream: column 3 section DATA start: 190666 length 3544
-    Stream: column 3 section LENGTH start: 194210 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
+  Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742
+    Stream: column 0 section ROW_INDEX start: 129215 length 17
+    Stream: column 1 section ROW_INDEX start: 129232 length 163
+    Stream: column 2 section ROW_INDEX start: 129395 length 168
+    Stream: column 3 section ROW_INDEX start: 129563 length 90
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304
+    Stream: column 1 section DATA start: 129957 length 20035
+    Stream: column 2 section DATA start: 149992 length 40050
+    Stream: column 3 section DATA start: 190042 length 3544
+    Stream: column 3 section LENGTH start: 193586 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 193611 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
-    Stream: column 0 section ROW_INDEX start: 194454 length 17
-    Stream: column 1 section ROW_INDEX start: 194471 length 165
-    Stream: column 2 section ROW_INDEX start: 194636 length 167
-    Stream: column 3 section ROW_INDEX start: 194803 length 91
-    Stream: column 3 section BLOOM_FILTER start: 194894 length 512
-    Stream: column 1 section DATA start: 195406 length 20035
-    Stream: column 2 section DATA start: 215441 length 40050
-    Stream: column 3 section DATA start: 255491 length 3574
-    Stream: column 3 section LENGTH start: 259065 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
+  Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744
+    Stream: column 0 section ROW_INDEX start: 193830 length 17
+    Stream: column 1 section ROW_INDEX start: 193847 length 165
+    Stream: column 2 section ROW_INDEX start: 194012 length 167
+    Stream: column 3 section ROW_INDEX start: 194179 length 91
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304
+    Stream: column 1 section DATA start: 194574 length 20035
+    Stream: column 2 section DATA start: 214609 length 40050
+    Stream: column 3 section DATA start: 254659 length 3574
+    Stream: column 3 section LENGTH start: 258233 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 258258 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
-    Stream: column 0 section ROW_INDEX start: 259309 length 12
-    Stream: column 1 section ROW_INDEX start: 259321 length 38
-    Stream: column 2 section ROW_INDEX start: 259359 length 41
-    Stream: column 3 section ROW_INDEX start: 259400 length 40
-    Stream: column 3 section BLOOM_FILTER start: 259440 length 301
-    Stream: column 1 section DATA start: 259741 length 4007
-    Stream: column 2 section DATA start: 263748 length 8010
-    Stream: column 3 section DATA start: 271758 length 768
-    Stream: column 3 section LENGTH start: 272526 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
+  Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382
+    Stream: column 0 section ROW_INDEX start: 258476 length 12
+    Stream: column 1 section ROW_INDEX start: 258488 length 38
+    Stream: column 2 section ROW_INDEX start: 258526 length 41
+    Stream: column 3 section ROW_INDEX start: 258567 length 40
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251
+    Stream: column 1 section DATA start: 258858 length 4007
+    Stream: column 2 section DATA start: 262865 length 8010
+    Stream: column 3 section DATA start: 270875 length 768
+    Stream: column 3 section LENGTH start: 271643 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 271668 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
       Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
 
-File length: 273307 bytes
+File length: 272427 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out 
b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index fa5cc2d..75cd5f4 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096
@@ -39,17 +39,20 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 2 section BLOOM_FILTER start: 355 length 6535
-    Stream: column 3 section ROW_INDEX start: 6890 length 87
-    Stream: column 1 section DATA start: 6977 length 20035
-    Stream: column 2 section DATA start: 27012 length 40050
-    Stream: column 3 section DATA start: 67062 length 3543
-    Stream: column 3 section LENGTH start: 70605 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
+    Stream: column 3 section ROW_INDEX start: 12936 length 87
+    Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892
+    Stream: column 1 section DATA start: 14953 length 20035
+    Stream: column 2 section DATA start: 34988 length 40050
+    Stream: column 3 section DATA start: 75038 length 3543
+    Stream: column 3 section LENGTH start: 78581 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 78606 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +70,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 
0.5178 expectedFpp: 0.009981772
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 
0.5155 expectedFpp: 0.009676614
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 
loadFactor: 0.9736 expectedFpp: 0.829482
-  Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
-    Stream: column 0 section ROW_INDEX start: 70848 length 17
-    Stream: column 1 section ROW_INDEX start: 70865 length 164
-    Stream: column 2 section ROW_INDEX start: 71029 length 168
-    Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
-    Stream: column 3 section ROW_INDEX start: 77730 length 83
-    Stream: column 1 section DATA start: 77813 length 20035
-    Stream: column 2 section DATA start: 97848 length 40050
-    Stream: column 3 section DATA start: 137898 length 3532
-    Stream: column 3 section LENGTH start: 141430 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
+  Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941
+    Stream: column 0 section ROW_INDEX start: 78843 length 17
+    Stream: column 1 section ROW_INDEX start: 78860 length 164
+    Stream: column 2 section ROW_INDEX start: 79024 length 168
+    Stream: column 2 section BLOOM_FILTER start: 79192 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046
+    Stream: column 3 section ROW_INDEX start: 91771 length 83
+    Stream: column 3 section BLOOM_FILTER start: 91854 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892
+    Stream: column 1 section DATA start: 93784 length 20035
+    Stream: column 2 section DATA start: 113819 length 40050
+    Stream: column 3 section DATA start: 153869 length 3532
+    Stream: column 3 section LENGTH start: 157401 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 157426 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +101,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 
0.5169 expectedFpp: 0.009855959
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 
0.5173 expectedFpp: 0.009911705
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 
loadFactor: 0.9733 expectedFpp: 0.8276205
-  Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
-    Stream: column 0 section ROW_INDEX start: 141673 length 17
-    Stream: column 1 section ROW_INDEX start: 141690 length 163
-    Stream: column 2 section ROW_INDEX start: 141853 length 168
-    Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
-    Stream: column 3 section ROW_INDEX start: 148554 length 90
-    Stream: column 1 section DATA start: 148644 length 20035
-    Stream: column 2 section DATA start: 168679 length 40050
-    Stream: column 3 section DATA start: 208729 length 3544
-    Stream: column 3 section LENGTH start: 212273 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
+  Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947
+    Stream: column 0 section ROW_INDEX start: 157662 length 17
+    Stream: column 1 section ROW_INDEX start: 157679 length 163
+    Stream: column 2 section ROW_INDEX start: 157842 length 168
+    Stream: column 2 section BLOOM_FILTER start: 158010 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046
+    Stream: column 3 section ROW_INDEX start: 170589 length 90
+    Stream: column 3 section BLOOM_FILTER start: 170679 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892
+    Stream: column 1 section DATA start: 172609 length 20035
+    Stream: column 2 section DATA start: 192644 length 40050
+    Stream: column 3 section DATA start: 232694 length 3544
+    Stream: column 3 section LENGTH start: 236238 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 236263 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +132,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 
0.5149 expectedFpp: 0.009594797
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 
0.5135 expectedFpp: 0.009419539
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 
loadFactor: 0.9722 expectedFpp: 0.82082444
-  Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
-    Stream: column 0 section ROW_INDEX start: 212516 length 17
-    Stream: column 1 section ROW_INDEX start: 212533 length 165
-    Stream: column 2 section ROW_INDEX start: 212698 length 167
-    Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
-    Stream: column 3 section ROW_INDEX start: 219389 length 91
-    Stream: column 1 section DATA start: 219480 length 20035
-    Stream: column 2 section DATA start: 239515 length 40050
-    Stream: column 3 section DATA start: 279565 length 3574
-    Stream: column 3 section LENGTH start: 283139 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
+  Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940
+    Stream: column 0 section ROW_INDEX start: 236500 length 17
+    Stream: column 1 section ROW_INDEX start: 236517 length 165
+    Stream: column 2 section ROW_INDEX start: 236682 length 167
+    Stream: column 2 section BLOOM_FILTER start: 236849 length 6524
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046
+    Stream: column 3 section ROW_INDEX start: 249419 length 91
+    Stream: column 3 section BLOOM_FILTER start: 249510 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892
+    Stream: column 1 section DATA start: 251440 length 20035
+    Stream: column 2 section DATA start: 271475 length 40050
+    Stream: column 3 section DATA start: 311525 length 3574
+    Stream: column 3 section LENGTH start: 315099 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 315124 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +163,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 
0.5147 expectedFpp: 0.009567649
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 
0.5201 expectedFpp: 0.010295142
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 
loadFactor: 0.9743 expectedFpp: 0.8332165
-  Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
-    Stream: column 0 section ROW_INDEX start: 283382 length 12
-    Stream: column 1 section ROW_INDEX start: 283394 length 38
-    Stream: column 2 section ROW_INDEX start: 283432 length 41
-    Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
-    Stream: column 3 section ROW_INDEX start: 284810 length 40
-    Stream: column 1 section DATA start: 284850 length 4007
-    Stream: column 2 section DATA start: 288857 length 8010
-    Stream: column 3 section DATA start: 296867 length 768
-    Stream: column 3 section LENGTH start: 297635 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
+  Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542
+    Stream: column 0 section ROW_INDEX start: 315360 length 12
+    Stream: column 1 section ROW_INDEX start: 315372 length 38
+    Stream: column 2 section ROW_INDEX start: 315410 length 41
+    Stream: column 2 section BLOOM_FILTER start: 315451 length 1337
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211
+    Stream: column 3 section ROW_INDEX start: 317999 length 40
+    Stream: column 3 section BLOOM_FILTER start: 318039 length 472
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391
+    Stream: column 1 section DATA start: 318902 length 4007
+    Stream: column 2 section DATA start: 322909 length 8010
+    Stream: column 3 section DATA start: 330919 length 768
+    Stream: column 3 section LENGTH start: 331687 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 331712 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +187,7 @@ Stripes:
       Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 
0.5154 expectedFpp: 0.00966294
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 
loadFactor: 0.5154 expectedFpp: 0.00966294
 
-File length: 298416 bytes
+File length: 332489 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git 
a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out 
b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 17a964b..4b0822f 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json 
b/java/tools/src/test/resources/orc-file-dump.json
index bf654a1..3dd0dc0 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1,7 +1,7 @@
 {
   "fileName": "TestFileDump.testDump.orc",
   "fileVersion": "0.12",
-  "writerVersion": "HIVE_13083",
+  "writerVersion": "ORC_101",
   "numberOfRows": 21000,
   "compression": "ZLIB",
   "compressionBufferSize": 4096,
@@ -254,9 +254,9 @@
       "stripeNumber": 1,
       "stripeInformation": {
         "offset": 3,
-        "indexLength": 970,
+        "indexLength": 762,
         "dataLength": 63770,
-        "footerLength": 90,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
@@ -286,44 +286,44 @@
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
+          "section": "BLOOM_FILTER_UTF8",
           "startOffset": 461,
-          "length": 512
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 973,
+          "startOffset": 765,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 21008,
+          "startOffset": 20800,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 61058,
+          "startOffset": 60850,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 61075,
+          "startOffset": 60867,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 64585,
+          "startOffset": 64377,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 64610,
+          "startOffset": 64402,
           "length": 133
         }
       ],
@@ -494,77 +494,77 @@
     {
       "stripeNumber": 2,
       "stripeInformation": {
-        "offset": 64833,
-        "indexLength": 961,
+        "offset": 64624,
+        "indexLength": 753,
         "dataLength": 63763,
-        "footerLength": 88,
+        "footerLength": 87,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 64833,
+          "startOffset": 64624,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 64850,
+          "startOffset": 64641,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 65016,
+          "startOffset": 64807,
           "length": 166
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 65182,
+          "startOffset": 64973,
           "length": 100
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 65282,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 65073,
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 65794,
+          "startOffset": 65377,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 85829,
+          "startOffset": 85412,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 125879,
+          "startOffset": 125462,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 125896,
+          "startOffset": 125479,
           "length": 3503
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 129399,
+          "startOffset": 128982,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 129424,
+          "startOffset": 129007,
           "length": 133
         }
       ],
@@ -735,77 +735,77 @@
     {
       "stripeNumber": 3,
       "stripeInformation": {
-        "offset": 129645,
-        "indexLength": 962,
+        "offset": 129227,
+        "indexLength": 754,
         "dataLength": 63770,
-        "footerLength": 91,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 129645,
+          "startOffset": 129227,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 129662,
+          "startOffset": 129244,
           "length": 164
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 129826,
+          "startOffset": 129408,
           "length": 167
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 129993,
+          "startOffset": 129575,
           "length": 102
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 130095,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 129677,
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 130607,
+          "startOffset": 129981,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 150642,
+          "startOffset": 150016,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 190692,
+          "startOffset": 190066,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 190709,
+          "startOffset": 190083,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 194219,
+          "startOffset": 193593,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 194244,
+          "startOffset": 193618,
           "length": 133
         }
       ],
@@ -976,77 +976,77 @@
     {
       "stripeNumber": 4,
       "stripeInformation": {
-        "offset": 194468,
-        "indexLength": 973,
+        "offset": 193840,
+        "indexLength": 765,
         "dataLength": 63756,
-        "footerLength": 91,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 194468,
+          "startOffset": 193840,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 194485,
+          "startOffset": 193857,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 194651,
+          "startOffset": 194023,
           "length": 171
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 194822,
+          "startOffset": 194194,
           "length": 107
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 194929,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 194301,
+          "length": 304
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 195441,
+          "startOffset": 194605,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 215476,
+          "startOffset": 214640,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 255526,
+          "startOffset": 254690,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 255543,
+          "startOffset": 254707,
           "length": 3496
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 259039,
+          "startOffset": 258203,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 259064,
+          "startOffset": 258228,
           "length": 133
         }
       ],
@@ -1217,8 +1217,8 @@
     {
       "stripeNumber": 5,
       "stripeInformation": {
-        "offset": 259288,
-        "indexLength": 433,
+        "offset": 258450,
+        "indexLength": 383,
         "dataLength": 12943,
         "footerLength": 83,
         "rowCount": 1000
@@ -1227,67 +1227,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 259288,
+          "startOffset": 258450,
           "length": 12
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 259300,
+          "startOffset": 258462,
           "length": 38
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 259338,
+          "startOffset": 258500,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 259379,
+          "startOffset": 258541,
           "length": 41
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 259420,
-          "length": 301
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 258582,
+          "length": 251
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 259721,
+          "startOffset": 258833,
           "length": 4007
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 263728,
+          "startOffset": 262840,
           "length": 8010
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 271738,
+          "startOffset": 270850,
           "length": 16
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 271754,
+          "startOffset": 270866,
           "length": 752
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 272506,
+          "startOffset": 271618,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 272531,
+          "startOffset": 271643,
           "length": 133
         }
       ],
@@ -1348,7 +1348,7 @@
       }]
     }
   ],
-  "fileLength": 273300,
+  "fileLength": 272409,
   "paddingLength": 0,
   "paddingRatio": 0,
   "status": "OK"

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.out 
b/java/tools/src/test/resources/orc-file-dump.out
index 70f7fbd..ae8195e 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-has-null.out 
b/java/tools/src/test/resources/orc-file-has-null.out
index df075d5..c02f803 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 20000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index dbc34ab..de6974e 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -91,6 +91,7 @@ message RowIndex {
 message BloomFilter {
   optional uint32 numHashFunctions = 1;
   repeated fixed64 bitset = 2;
+  optional bytes utf8bitset = 3;
 }
 
 message BloomFilterIndex {
@@ -109,6 +110,7 @@ message Stream {
     SECONDARY = 5;
     ROW_INDEX = 6;
     BLOOM_FILTER = 7;
+    BLOOM_FILTER_UTF8 = 8;
   }
   optional Kind kind = 1;
   optional uint32 column = 2;

Reply via email to