Repository: orc
Updated Branches:
  refs/heads/master 604dcc801 -> 833cc0e73 (forced update)


http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
deleted file mode 100644
index e60690d..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * BloomFilter is a probabilistic data structure for set membership checks. BloomFilters are
- * highly space efficient when compared to using a HashSet. Because of the probabilistic nature
- * of a bloom filter, false positives (element not present in bloom filter but test() says true)
- * are possible, but false negatives are not (if an element is present then test() will never
- * say false). The false positive probability is configurable (default: 5%), and the storage
- * requirement increases or decreases accordingly: the lower the false positive probability,
- * the greater the space requirement.
- * Bloom filters are sensitive to the number of elements that will be inserted into them:
- * the expected number of entries must be specified when the filter is created, and if the
- * number of insertions exceeds it, the false positive probability will increase accordingly.
- *
- * Internally, this implementation uses the Murmur3 fast non-cryptographic hash algorithm.
- * Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash collisions
- * for specific sequences of repeating bytes. Check the following link for more info
- * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
- */
-public class BloomFilter {
-  public static final double DEFAULT_FPP = 0.05;
-  protected BitSet bitSet;
-  protected int numBits;
-  protected int numHashFunctions;
-
-  public BloomFilter() {
-  }
-
-  public BloomFilter(long expectedEntries) {
-    this(expectedEntries, DEFAULT_FPP);
-  }
-
-  static void checkArgument(boolean expression, String message) {
-    if (!expression) {
-      throw new IllegalArgumentException(message);
-    }
-  }
-
-  public BloomFilter(long expectedEntries, double fpp) {
-    checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
-    checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
-    int nb = optimalNumOfBits(expectedEntries, fpp);
-    // make 'm' a multiple of 64
-    this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
-    this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
-    this.bitSet = new BitSet(numBits);
-  }
-
-  /**
-   * A constructor to support rebuilding the BloomFilter from a serialized representation.
-   * @param bits - bit set as a list of longs
-   * @param numBits - number of bits in the filter
-   * @param numFuncs - number of hash functions
-   */
-  public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
-    super();
-    long[] copied = new long[bits.size()];
-    for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i);
-    bitSet = new BitSet(copied);
-    this.numBits = numBits;
-    numHashFunctions = numFuncs;
-  }
-
-  static int optimalNumOfHashFunctions(long n, long m) {
-    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
-  }
-
-  static int optimalNumOfBits(long n, double p) {
-    return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
-  }
-
-  public void add(byte[] val) {
-    if (val == null) {
-      addBytes(val, -1, -1);
-    } else {
-      addBytes(val, 0, val.length);
-    }
-  }
-
-  public void addBytes(byte[] val, int offset, int length) {
-    // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
-    // by Kirsch et al. From the abstract: 'only two hash functions are necessary to effectively
-    // implement a Bloom filter without any loss in the asymptotic false positive probability'
-
-    // Let's split up the 64-bit hashcode into two 32-bit hash codes and employ the technique
-    // mentioned in the above paper
-    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
-        Murmur3.hash64(val, offset, length);
-    addHash(hash64);
-  }
-
-  private void addHash(long hash64) {
-    int hash1 = (int) hash64;
-    int hash2 = (int) (hash64 >>> 32);
-
-    for (int i = 1; i <= numHashFunctions; i++) {
-      int combinedHash = hash1 + (i * hash2);
-      // hashcode should be positive, flip all the bits if it's negative
-      if (combinedHash < 0) {
-        combinedHash = ~combinedHash;
-      }
-      int pos = combinedHash % numBits;
-      bitSet.set(pos);
-    }
-  }
-
-  public void addString(String val) {
-    if (val == null) {
-      add(null);
-    } else {
-      add(val.getBytes());
-    }
-  }
-
-  public void addLong(long val) {
-    addHash(getLongHash(val));
-  }
-
-  public void addDouble(double val) {
-    addLong(Double.doubleToLongBits(val));
-  }
-
-  public boolean test(byte[] val) {
-    if (val == null) {
-      return testBytes(val, -1, -1);
-    }
-    return testBytes(val, 0, val.length);
-  }
-
-  public boolean testBytes(byte[] val, int offset, int length) {
-    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
-        Murmur3.hash64(val, offset, length);
-    return testHash(hash64);
-  }
-
-  private boolean testHash(long hash64) {
-    int hash1 = (int) hash64;
-    int hash2 = (int) (hash64 >>> 32);
-
-    for (int i = 1; i <= numHashFunctions; i++) {
-      int combinedHash = hash1 + (i * hash2);
-      // hashcode should be positive, flip all the bits if it's negative
-      if (combinedHash < 0) {
-        combinedHash = ~combinedHash;
-      }
-      int pos = combinedHash % numBits;
-      if (!bitSet.get(pos)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  public boolean testString(String val) {
-    if (val == null) {
-      return test(null);
-    } else {
-      return test(val.getBytes());
-    }
-  }
-
-  public boolean testLong(long val) {
-    return testHash(getLongHash(val));
-  }
-
-  // Thomas Wang's integer hash function
-  // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
-  private long getLongHash(long key) {
-    key = (~key) + (key << 21); // key = (key << 21) - key - 1;
-    key = key ^ (key >> 24);
-    key = (key + (key << 3)) + (key << 8); // key * 265
-    key = key ^ (key >> 14);
-    key = (key + (key << 2)) + (key << 4); // key * 21
-    key = key ^ (key >> 28);
-    key = key + (key << 31);
-    return key;
-  }
-
-  public boolean testDouble(double val) {
-    return testLong(Double.doubleToLongBits(val));
-  }
-
-  public long sizeInBytes() {
-    return getBitSize() / 8;
-  }
-
-  public int getBitSize() {
-    return bitSet.getData().length * Long.SIZE;
-  }
-
-  public int getNumHashFunctions() {
-    return numHashFunctions;
-  }
-
-  public long[] getBitSet() {
-    return bitSet.getData();
-  }
-
-  @Override
-  public String toString() {
-    return "m: " + numBits + " k: " + numHashFunctions;
-  }
-
-  /**
-   * Merge the specified bloom filter with current bloom filter.
-   *
-   * @param that - bloom filter to merge
-   */
-  public void merge(BloomFilter that) {
-    if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
-      this.bitSet.putAll(that.bitSet);
-    } else {
-      throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
-          " this - " + this.toString() + " that - " + that.toString());
-    }
-  }
-
-  public void reset() {
-    this.bitSet.clear();
-  }
-
-  /**
-   * Bare metal bit set implementation. For performance reasons, this implementation does not check
-   * for index bounds nor expand the bit set size if the specified index is greater than the size.
-   */
-  public class BitSet {
-    private final long[] data;
-
-    public BitSet(long bits) {
-      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
-    }
-
-    /**
-     * Deserialize long array as bit set.
-     *
-     * @param data - bit array
-     */
-    public BitSet(long[] data) {
-      assert data.length > 0 : "data length is zero!";
-      this.data = data;
-    }
-
-    /**
-     * Sets the bit at specified index.
-     *
-     * @param index - position
-     */
-    public void set(int index) {
-      data[index >>> 6] |= (1L << index);
-    }
-
-    /**
-     * Returns true if the bit is set in the specified index.
-     *
-     * @param index - position
-     * @return - value at the bit position
-     */
-    public boolean get(int index) {
-      return (data[index >>> 6] & (1L << index)) != 0;
-    }
-
-    /**
-     * Number of bits
-     */
-    public long bitSize() {
-      return (long) data.length * Long.SIZE;
-    }
-
-    public long[] getData() {
-      return data;
-    }
-
-    /**
-     * Combines the two BitArrays using bitwise OR.
-     */
-    public void putAll(BitSet array) {
-      assert data.length == array.data.length :
-          "BitArrays must be of equal length (" + data.length + "!= " + 
array.data.length + ")";
-      for (int i = 0; i < data.length; i++) {
-        data[i] |= array.data[i];
-      }
-    }
-
-    /**
-     * Clear the bit set.
-     */
-    public void clear() {
-      Arrays.fill(data, 0);
-    }
-  }
-}
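
For context (not part of the patch): the deleted class is exercised roughly as below. A minimal usage sketch against the API shown in this file; the "alice"/"bob" inputs are hypothetical. An equivalent class is used from org.apache.orc.util later in this change.

    // Sketch: basic add/test cycle on the BloomFilter defined above.
    BloomFilter bf = new BloomFilter(10000, 0.01); // 10,000 expected entries, 1% FPP
    bf.addString("alice");
    bf.addLong(42L);

    bf.testString("alice"); // always true: no false negatives
    bf.testLong(42L);       // always true
    bf.testString("bob");   // usually false; true with roughly 1% probability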

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
deleted file mode 100644
index 88c3514..0000000
--- a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+++ /dev/null
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-/**
- * Murmur3 is the successor to the Murmur2 fast non-cryptographic hash algorithm.
- *
- * Murmur3 32 and 128 bit variants.
- * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
- *
- * This is public domain code with no copyrights.
- * From the homepage of MurmurHash (https://code.google.com/p/smhasher/),
- * "All MurmurHash versions are public domain software, and the author disclaims all copyright
- * to their code."
- */
-public class Murmur3 {
-  // from 64-bit linear congruential generator
-  public static final long NULL_HASHCODE = 2862933555777941757L;
-
-  // Constants for 32 bit variant
-  private static final int C1_32 = 0xcc9e2d51;
-  private static final int C2_32 = 0x1b873593;
-  private static final int R1_32 = 15;
-  private static final int R2_32 = 13;
-  private static final int M_32 = 5;
-  private static final int N_32 = 0xe6546b64;
-
-  // Constants for 128 bit variant
-  private static final long C1 = 0x87c37b91114253d5L;
-  private static final long C2 = 0x4cf5ad432745937fL;
-  private static final int R1 = 31;
-  private static final int R2 = 27;
-  private static final int R3 = 33;
-  private static final int M = 5;
-  private static final int N1 = 0x52dce729;
-  private static final int N2 = 0x38495ab5;
-
-  private static final int DEFAULT_SEED = 104729;
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data) {
-    return hash32(data, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 32-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default 0)
-   * @return - hashcode
-   */
-  public static int hash32(byte[] data, int length, int seed) {
-    int hash = seed;
-    final int nblocks = length >> 2;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      int i_4 = i << 2;
-      int k = (data[i_4] & 0xff)
-          | ((data[i_4 + 1] & 0xff) << 8)
-          | ((data[i_4 + 2] & 0xff) << 16)
-          | ((data[i_4 + 3] & 0xff) << 24);
-
-      // mix functions
-      k *= C1_32;
-      k = Integer.rotateLeft(k, R1_32);
-      k *= C2_32;
-      hash ^= k;
-      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
-    }
-
-    // tail
-    int idx = nblocks << 2;
-    int k1 = 0;
-    switch (length - idx) {
-      case 3:
-        k1 ^= data[idx + 2] << 16;
-      case 2:
-        k1 ^= data[idx + 1] << 8;
-      case 1:
-        k1 ^= data[idx];
-
-        // mix functions
-        k1 *= C1_32;
-        k1 = Integer.rotateLeft(k1, R1_32);
-        k1 *= C2_32;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash ^= (hash >>> 16);
-    hash *= 0x85ebca6b;
-    hash ^= (hash >>> 13);
-    hash *= 0xc2b2ae35;
-    hash ^= (hash >>> 16);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially the MSB 8 bytes of the Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data) {
-    return hash64(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  public static long hash64(byte[] data, int offset, int length) {
-    return hash64(data, offset, length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 64-bit variant. This is essentially the MSB 8 bytes of the Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode
-   */
-  public static long hash64(byte[] data, int offset, int length, int seed) {
-    long hash = seed;
-    final int nblocks = length >> 3;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i8 = i << 3;
-      long k = ((long) data[offset + i8] & 0xff)
-          | (((long) data[offset + i8 + 1] & 0xff) << 8)
-          | (((long) data[offset + i8 + 2] & 0xff) << 16)
-          | (((long) data[offset + i8 + 3] & 0xff) << 24)
-          | (((long) data[offset + i8 + 4] & 0xff) << 32)
-          | (((long) data[offset + i8 + 5] & 0xff) << 40)
-          | (((long) data[offset + i8 + 6] & 0xff) << 48)
-          | (((long) data[offset + i8 + 7] & 0xff) << 56);
-
-      // mix functions
-      k *= C1;
-      k = Long.rotateLeft(k, R1);
-      k *= C2;
-      hash ^= k;
-      hash = Long.rotateLeft(hash, R2) * M + N1;
-    }
-
-    // tail
-    long k1 = 0;
-    int tailStart = nblocks << 3;
-    switch (length - tailStart) {
-      case 7:
-        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= ((long) data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        hash ^= k1;
-    }
-
-    // finalization
-    hash ^= length;
-    hash = fmix64(hash);
-
-    return hash;
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data - input byte array
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data) {
-    return hash128(data, 0, data.length, DEFAULT_SEED);
-  }
-
-  /**
-   * Murmur3 128-bit variant.
-   *
-   * @param data   - input byte array
-   * @param offset - the first element of array
-   * @param length - length of array
-   * @param seed   - seed. (default is 0)
-   * @return - hashcode (2 longs)
-   */
-  public static long[] hash128(byte[] data, int offset, int length, int seed) {
-    long h1 = seed;
-    long h2 = seed;
-    final int nblocks = length >> 4;
-
-    // body
-    for (int i = 0; i < nblocks; i++) {
-      final int i16 = i << 4;
-      long k1 = ((long) data[offset + i16] & 0xff)
-          | (((long) data[offset + i16 + 1] & 0xff) << 8)
-          | (((long) data[offset + i16 + 2] & 0xff) << 16)
-          | (((long) data[offset + i16 + 3] & 0xff) << 24)
-          | (((long) data[offset + i16 + 4] & 0xff) << 32)
-          | (((long) data[offset + i16 + 5] & 0xff) << 40)
-          | (((long) data[offset + i16 + 6] & 0xff) << 48)
-          | (((long) data[offset + i16 + 7] & 0xff) << 56);
-
-      long k2 = ((long) data[offset + i16 + 8] & 0xff)
-          | (((long) data[offset + i16 + 9] & 0xff) << 8)
-          | (((long) data[offset + i16 + 10] & 0xff) << 16)
-          | (((long) data[offset + i16 + 11] & 0xff) << 24)
-          | (((long) data[offset + i16 + 12] & 0xff) << 32)
-          | (((long) data[offset + i16 + 13] & 0xff) << 40)
-          | (((long) data[offset + i16 + 14] & 0xff) << 48)
-          | (((long) data[offset + i16 + 15] & 0xff) << 56);
-
-      // mix functions for k1
-      k1 *= C1;
-      k1 = Long.rotateLeft(k1, R1);
-      k1 *= C2;
-      h1 ^= k1;
-      h1 = Long.rotateLeft(h1, R2);
-      h1 += h2;
-      h1 = h1 * M + N1;
-
-      // mix functions for k2
-      k2 *= C2;
-      k2 = Long.rotateLeft(k2, R3);
-      k2 *= C1;
-      h2 ^= k2;
-      h2 = Long.rotateLeft(h2, R1);
-      h2 += h1;
-      h2 = h2 * M + N2;
-    }
-
-    // tail
-    long k1 = 0;
-    long k2 = 0;
-    int tailStart = nblocks << 4;
-    switch (length - tailStart) {
-      case 15:
-        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
-      case 14:
-        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
-      case 13:
-        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
-      case 12:
-        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
-      case 11:
-        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
-      case 10:
-        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
-      case 9:
-        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
-        k2 *= C2;
-        k2 = Long.rotateLeft(k2, R3);
-        k2 *= C1;
-        h2 ^= k2;
-
-      case 8:
-        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
-      case 7:
-        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
-      case 6:
-        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
-      case 5:
-        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
-      case 4:
-        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
-      case 3:
-        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
-      case 2:
-        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
-      case 1:
-        k1 ^= (long) (data[offset + tailStart] & 0xff);
-        k1 *= C1;
-        k1 = Long.rotateLeft(k1, R1);
-        k1 *= C2;
-        h1 ^= k1;
-    }
-
-    // finalization
-    h1 ^= length;
-    h2 ^= length;
-
-    h1 += h2;
-    h2 += h1;
-
-    h1 = fmix64(h1);
-    h2 = fmix64(h2);
-
-    h1 += h2;
-    h2 += h1;
-
-    return new long[]{h1, h2};
-  }
-
-  private static long fmix64(long h) {
-    h ^= (h >>> 33);
-    h *= 0xff51afd7ed558ccdL;
-    h ^= (h >>> 33);
-    h *= 0xc4ceb9fe1a85ec53L;
-    h ^= (h >>> 33);
-    return h;
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/orc/util/Murmur3.java b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
new file mode 100644
index 0000000..838681c
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+/**
+ * Murmur3 is the successor to the Murmur2 fast non-cryptographic hash algorithm.
+ *
+ * Murmur3 32 and 128 bit variants.
+ * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
+ * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
+ *
+ * This is public domain code with no copyrights.
+ * From the homepage of MurmurHash (https://code.google.com/p/smhasher/),
+ * "All MurmurHash versions are public domain software, and the author disclaims all copyright
+ * to their code."
+ */
+public class Murmur3 {
+  // from 64-bit linear congruential generator
+  public static final long NULL_HASHCODE = 2862933555777941757L;
+
+  // Constants for 32 bit variant
+  private static final int C1_32 = 0xcc9e2d51;
+  private static final int C2_32 = 0x1b873593;
+  private static final int R1_32 = 15;
+  private static final int R2_32 = 13;
+  private static final int M_32 = 5;
+  private static final int N_32 = 0xe6546b64;
+
+  // Constants for 128 bit variant
+  private static final long C1 = 0x87c37b91114253d5L;
+  private static final long C2 = 0x4cf5ad432745937fL;
+  private static final int R1 = 31;
+  private static final int R2 = 27;
+  private static final int R3 = 33;
+  private static final int M = 5;
+  private static final int N1 = 0x52dce729;
+  private static final int N2 = 0x38495ab5;
+
+  private static final int DEFAULT_SEED = 104729;
+
+  /**
+   * Murmur3 32-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode
+   */
+  public static int hash32(byte[] data) {
+    return hash32(data, data.length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 32-bit variant.
+   *
+   * @param data   - input byte array
+   * @param length - length of array
+   * @param seed   - seed. (default 0)
+   * @return - hashcode
+   */
+  public static int hash32(byte[] data, int length, int seed) {
+    int hash = seed;
+    final int nblocks = length >> 2;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      int i_4 = i << 2;
+      int k = (data[i_4] & 0xff)
+          | ((data[i_4 + 1] & 0xff) << 8)
+          | ((data[i_4 + 2] & 0xff) << 16)
+          | ((data[i_4 + 3] & 0xff) << 24);
+
+      // mix functions
+      k *= C1_32;
+      k = Integer.rotateLeft(k, R1_32);
+      k *= C2_32;
+      hash ^= k;
+      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+    }
+
+    // tail
+    int idx = nblocks << 2;
+    int k1 = 0;
+    switch (length - idx) {
+      case 3:
+        k1 ^= data[idx + 2] << 16;
+      case 2:
+        k1 ^= data[idx + 1] << 8;
+      case 1:
+        k1 ^= data[idx];
+
+        // mix functions
+        k1 *= C1_32;
+        k1 = Integer.rotateLeft(k1, R1_32);
+        k1 *= C2_32;
+        hash ^= k1;
+    }
+
+    // finalization
+    hash ^= length;
+    hash ^= (hash >>> 16);
+    hash *= 0x85ebca6b;
+    hash ^= (hash >>> 13);
+    hash *= 0xc2b2ae35;
+    hash ^= (hash >>> 16);
+
+    return hash;
+  }
+
+  /**
+   * Murmur3 64-bit variant. This is essentially the MSB 8 bytes of the Murmur3 128-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode
+   */
+  public static long hash64(byte[] data) {
+    return hash64(data, 0, data.length, DEFAULT_SEED);
+  }
+
+  public static long hash64(byte[] data, int offset, int length) {
+    return hash64(data, offset, length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 64-bit variant. This is essentially the MSB 8 bytes of the Murmur3 128-bit variant.
+   *
+   * @param data   - input byte array
+   * @param offset - start offset of the array
+   * @param length - length of array
+   * @param seed   - seed. (default is 0)
+   * @return - hashcode
+   */
+  public static long hash64(byte[] data, int offset, int length, int seed) {
+    long hash = seed;
+    final int nblocks = length >> 3;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      final int i8 = i << 3;
+      long k = ((long) data[offset + i8] & 0xff)
+          | (((long) data[offset + i8 + 1] & 0xff) << 8)
+          | (((long) data[offset + i8 + 2] & 0xff) << 16)
+          | (((long) data[offset + i8 + 3] & 0xff) << 24)
+          | (((long) data[offset + i8 + 4] & 0xff) << 32)
+          | (((long) data[offset + i8 + 5] & 0xff) << 40)
+          | (((long) data[offset + i8 + 6] & 0xff) << 48)
+          | (((long) data[offset + i8 + 7] & 0xff) << 56);
+
+      // mix functions
+      k *= C1;
+      k = Long.rotateLeft(k, R1);
+      k *= C2;
+      hash ^= k;
+      hash = Long.rotateLeft(hash, R2) * M + N1;
+    }
+
+    // tail
+    long k1 = 0;
+    int tailStart = nblocks << 3;
+    switch (length - tailStart) {
+      case 7:
+        k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
+      case 6:
+        k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
+      case 5:
+        k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
+      case 4:
+        k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
+      case 3:
+        k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
+      case 2:
+        k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
+      case 1:
+        k1 ^= ((long) data[offset + tailStart] & 0xff);
+        k1 *= C1;
+        k1 = Long.rotateLeft(k1, R1);
+        k1 *= C2;
+        hash ^= k1;
+    }
+
+    // finalization
+    hash ^= length;
+    hash = fmix64(hash);
+
+    return hash;
+  }
+
+  /**
+   * Murmur3 128-bit variant.
+   *
+   * @param data - input byte array
+   * @return - hashcode (2 longs)
+   */
+  public static long[] hash128(byte[] data) {
+    return hash128(data, 0, data.length, DEFAULT_SEED);
+  }
+
+  /**
+   * Murmur3 128-bit variant.
+   *
+   * @param data   - input byte array
+   * @param offset - the first element of array
+   * @param length - length of array
+   * @param seed   - seed. (default is 0)
+   * @return - hashcode (2 longs)
+   */
+  public static long[] hash128(byte[] data, int offset, int length, int seed) {
+    long h1 = seed;
+    long h2 = seed;
+    final int nblocks = length >> 4;
+
+    // body
+    for (int i = 0; i < nblocks; i++) {
+      final int i16 = i << 4;
+      long k1 = ((long) data[offset + i16] & 0xff)
+          | (((long) data[offset + i16 + 1] & 0xff) << 8)
+          | (((long) data[offset + i16 + 2] & 0xff) << 16)
+          | (((long) data[offset + i16 + 3] & 0xff) << 24)
+          | (((long) data[offset + i16 + 4] & 0xff) << 32)
+          | (((long) data[offset + i16 + 5] & 0xff) << 40)
+          | (((long) data[offset + i16 + 6] & 0xff) << 48)
+          | (((long) data[offset + i16 + 7] & 0xff) << 56);
+
+      long k2 = ((long) data[offset + i16 + 8] & 0xff)
+          | (((long) data[offset + i16 + 9] & 0xff) << 8)
+          | (((long) data[offset + i16 + 10] & 0xff) << 16)
+          | (((long) data[offset + i16 + 11] & 0xff) << 24)
+          | (((long) data[offset + i16 + 12] & 0xff) << 32)
+          | (((long) data[offset + i16 + 13] & 0xff) << 40)
+          | (((long) data[offset + i16 + 14] & 0xff) << 48)
+          | (((long) data[offset + i16 + 15] & 0xff) << 56);
+
+      // mix functions for k1
+      k1 *= C1;
+      k1 = Long.rotateLeft(k1, R1);
+      k1 *= C2;
+      h1 ^= k1;
+      h1 = Long.rotateLeft(h1, R2);
+      h1 += h2;
+      h1 = h1 * M + N1;
+
+      // mix functions for k2
+      k2 *= C2;
+      k2 = Long.rotateLeft(k2, R3);
+      k2 *= C1;
+      h2 ^= k2;
+      h2 = Long.rotateLeft(h2, R1);
+      h2 += h1;
+      h2 = h2 * M + N2;
+    }
+
+    // tail
+    long k1 = 0;
+    long k2 = 0;
+    int tailStart = nblocks << 4;
+    switch (length - tailStart) {
+      case 15:
+        k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
+      case 14:
+        k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
+      case 13:
+        k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
+      case 12:
+        k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
+      case 11:
+        k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
+      case 10:
+        k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
+      case 9:
+        k2 ^= (long) (data[offset + tailStart + 8] & 0xff);
+        k2 *= C2;
+        k2 = Long.rotateLeft(k2, R3);
+        k2 *= C1;
+        h2 ^= k2;
+
+      case 8:
+        k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
+      case 7:
+        k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
+      case 6:
+        k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
+      case 5:
+        k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
+      case 4:
+        k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
+      case 3:
+        k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
+      case 2:
+        k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
+      case 1:
+        k1 ^= (long) (data[offset + tailStart] & 0xff);
+        k1 *= C1;
+        k1 = Long.rotateLeft(k1, R1);
+        k1 *= C2;
+        h1 ^= k1;
+    }
+
+    // finalization
+    h1 ^= length;
+    h2 ^= length;
+
+    h1 += h2;
+    h2 += h1;
+
+    h1 = fmix64(h1);
+    h2 = fmix64(h2);
+
+    h1 += h2;
+    h2 += h1;
+
+    return new long[]{h1, h2};
+  }
+
+  private static long fmix64(long h) {
+    h ^= (h >>> 33);
+    h *= 0xff51afd7ed558ccdL;
+    h ^= (h >>> 33);
+    h *= 0xc4ceb9fe1a85ec53L;
+    h ^= (h >>> 33);
+    return h;
+  }
+}
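
For context (not part of the patch): a minimal sketch of calling the relocated hash, using only the entry points added above; the input bytes are hypothetical.

    import org.apache.orc.util.Murmur3;

    byte[] data = "hello orc".getBytes();
    int h32 = Murmur3.hash32(data);                   // 32-bit variant, DEFAULT_SEED
    long h64 = Murmur3.hash64(data, 0, data.length);  // MSB 8 bytes of the 128-bit variant
    long[] h128 = Murmur3.hash128(data);              // {h1, h2}

    // Only the bytes in [offset, offset + length) matter, so the same payload at a
    // different offset hashes identically (the offset tests deleted below rely on
    // exactly this property).
    byte[] shifted = new byte[data.length + 16];
    System.arraycopy(data, 0, shifted, 16, data.length);
    assert h64 == Murmur3.hash64(shifted, 16, data.length);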

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
deleted file mode 100644
index 5facc7c..0000000
--- a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hive.common.util;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.util.Arrays;
-import java.util.Random;
-
-/**
- * Tests for Murmur3 variants.
- */
-public class TestMurmur3 {
-
-  @Test
-  public void testHashCodesM3_32_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_32(seed);
-    int hc1 = hf.hashBytes(key.getBytes()).asInt();
-    int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-
-    key = "testkey";
-    hc1 = hf.hashBytes(key.getBytes()).asInt();
-    hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed);
-    assertEquals(hc1, hc2);
-  }
-
-  @Test
-  public void testHashCodesM3_32_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_32_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_32(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      int hc1 = hf.hashBytes(data).asInt();
-      int hc2 = Murmur3.hash32(data, data.length, seed);
-      assertEquals(hc1, hc2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_string() {
-    String key = "test";
-    int seed = 123;
-    HashFunction hf = Hashing.murmur3_128(seed);
-    // guava stores the hashcodes in little endian order
-    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    long gl1 = buf.getLong();
-    long gl2 = buf.getLong(8);
-    long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed);
-    long m1 = hc[0];
-    long m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    key = "testkey128_testkey128";
-    buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-    buf.put(hf.hashBytes(key.getBytes()).asBytes());
-    buf.flip();
-    gl1 = buf.getLong();
-    gl2 = buf.getLong(8);
-    byte[] keyBytes = key.getBytes();
-    hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed);
-    m1 = hc[0];
-    m2 = hc[1];
-    assertEquals(gl1, m1);
-    assertEquals(gl2, m2);
-
-    byte[] offsetKeyBytes = new byte[keyBytes.length + 35];
-    Arrays.fill(offsetKeyBytes, (byte) -1);
-    System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length);
-    hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed);
-    assertEquals(gl1, hc[0]);
-    assertEquals(gl2, hc[1]);
-  }
-
-  @Test
-  public void testHashCodeM3_64() {
-    byte[] origin = ("It was the best of times, it was the worst of times," +
-        " it was the age of wisdom, it was the age of foolishness," +
-        " it was the epoch of belief, it was the epoch of incredulity," +
-        " it was the season of Light, it was the season of Darkness," +
-        " it was the spring of hope, it was the winter of despair," +
-        " we had everything before us, we had nothing before us," +
-        " we were all going direct to Heaven," +
-        " we were all going direct the other way.").getBytes();
-    long hash = Murmur3.hash64(origin, 0, origin.length);
-    assertEquals(305830725663368540L, hash);
-
-    byte[] originOffset = new byte[origin.length + 150];
-    Arrays.fill(originOffset, (byte) 123);
-    System.arraycopy(origin, 0, originOffset, 150, origin.length);
-    hash = Murmur3.hash64(originOffset, 150, origin.length);
-    assertEquals(305830725663368540L, hash);
-  }
-
-  @Test
-  public void testHashCodesM3_128_ints() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      int val = rand.nextInt();
-      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-
-      byte[] offsetData = new byte[data.length + 50];
-      System.arraycopy(data, 0, offsetData, 50, data.length);
-      hc = Murmur3.hash128(offsetData, 50, data.length, seed);
-      assertEquals(gl1, hc[0]);
-      assertEquals(gl2, hc[1]);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_longs() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      long val = rand.nextLong();
-      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-
-  @Test
-  public void testHashCodesM3_128_double() {
-    int seed = 123;
-    Random rand = new Random(seed);
-    HashFunction hf = Hashing.murmur3_128(seed);
-    for (int i = 0; i < 1000; i++) {
-      double val = rand.nextDouble();
-      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
-      // guava stores the hashcodes in little endian order
-      ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
-      buf.put(hf.hashBytes(data).asBytes());
-      buf.flip();
-      long gl1 = buf.getLong();
-      long gl2 = buf.getLong(8);
-      long[] hc = Murmur3.hash128(data, 0, data.length, seed);
-      long m1 = hc[0];
-      long m2 = hc[1];
-      assertEquals(gl1, m1);
-      assertEquals(gl2, m2);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 876070b..7206503 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -37,7 +37,8 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.OrcFile;
@@ -383,7 +384,9 @@ public final class FileDump {
           StringBuilder buf = new StringBuilder();
          String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
           buf.append(rowIdxString);
-          String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
+          String bloomFilString = getFormattedBloomFilters(col, indices,
+              reader.getWriterVersion(),
+              reader.getSchema().findSubtype(col).getCategory());
           buf.append(bloomFilString);
           System.out.println(buf);
         }
@@ -604,15 +607,18 @@ public final class FileDump {
     return -1;
   }
 
-  private static String getFormattedBloomFilters(int col,
-      OrcProto.BloomFilterIndex[] bloomFilterIndex) {
+  private static String getFormattedBloomFilters(int col, OrcIndex index,
+                                                 OrcFile.WriterVersion version,
+                                                 TypeDescription.Category type) {
+    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
     StringBuilder buf = new StringBuilder();
-    BloomFilterIO stripeLevelBF = null;
+    BloomFilter stripeLevelBF = null;
     if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
       int idx = 0;
       buf.append("\n    Bloom filters for column ").append(col).append(":");
      for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
-        BloomFilterIO toMerge = new BloomFilterIO(bf);
+        BloomFilter toMerge = BloomFilterIO.deserialize(
+            index.getBloomFilterKinds()[col], version, type, bf);
         buf.append("\n      Entry 
").append(idx++).append(":").append(getBloomFilterStats(toMerge));
         if (stripeLevelBF == null) {
           stripeLevelBF = toMerge;
@@ -626,7 +632,7 @@ public final class FileDump {
     return buf.toString();
   }
 
-  private static String getBloomFilterStats(BloomFilterIO bf) {
+  private static String getBloomFilterStats(BloomFilter bf) {
     StringBuilder sb = new StringBuilder();
     int bitCount = bf.getBitSize();
     int popCount = 0;
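
For context (not part of the patch): the dump tools now go through the new deserialize entry point instead of constructing a BloomFilterIO directly. A sketch of the call pattern, using only names visible in the hunks above; BloomFilterIO.deserialize itself is defined elsewhere in this change:

    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
    for (OrcProto.BloomFilter bfProto : bloomFilterIndex[col].getBloomFilterList()) {
      BloomFilter bf = BloomFilterIO.deserialize(
          index.getBloomFilterKinds()[col],                  // BLOOM_FILTER vs BLOOM_FILTER_UTF8
          reader.getWriterVersion(),                         // e.g. ORC_101
          reader.getSchema().findSubtype(col).getCategory(), // column type selects the hashing
          bfProto);
      // merge into a stripe-level filter, print stats, ...
    }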

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index e2048ea..aa3072c 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -20,18 +20,20 @@ package org.apache.orc.tools;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
 import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
 import org.apache.orc.impl.AcidStats;
 import org.apache.orc.impl.OrcAcidUtils;
 import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
 import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
+import org.apache.orc.util.BloomFilterIO;
 import org.apache.orc.BinaryColumnStatistics;
 import org.apache.orc.BooleanColumnStatistics;
 import org.apache.orc.ColumnStatistics;
@@ -50,12 +52,16 @@ import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
 import org.codehaus.jettison.json.JSONStringer;
 import org.codehaus.jettison.json.JSONWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * File dump tool with json formatted output.
  */
 public class JsonFileDump {
 
+  private static final Logger LOG = LoggerFactory.getLogger(JsonFileDump.class);
+
   public static void printJsonMetaData(List<String> files,
       Configuration conf,
       List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
@@ -185,7 +191,9 @@ public class JsonFileDump {
               writer.object();
               writer.key("columnId").value(col);
               writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
-              writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+              writeBloomFilterIndexes(writer, col, indices,
+                  reader.getWriterVersion(),
+                  reader.getSchema().findSubtype(col).getCategory());
               writer.endObject();
             }
             writer.endArray();
@@ -334,16 +342,21 @@ public class JsonFileDump {
   }
 
   private static void writeBloomFilterIndexes(JSONWriter writer, int col,
-      OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
+                                              OrcIndex index,
+                                              OrcFile.WriterVersion version,
+                                              TypeDescription.Category type
+                                              ) throws JSONException {
 
-    BloomFilterIO stripeLevelBF = null;
+    BloomFilter stripeLevelBF = null;
+    OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
     if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
       int entryIx = 0;
       writer.key("bloomFilterIndexes").array();
      for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
         writer.object();
         writer.key("entryId").value(entryIx++);
-        BloomFilterIO toMerge = new BloomFilterIO(bf);
+        BloomFilter toMerge = BloomFilterIO.deserialize(
+            index.getBloomFilterKinds()[col], version, type, bf);
         writeBloomFilterStats(writer, toMerge);
         if (stripeLevelBF == null) {
           stripeLevelBF = toMerge;
@@ -362,7 +375,7 @@ public class JsonFileDump {
     }
   }
 
-  private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
+  private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf)
       throws JSONException {
     int bitCount = bf.getBitSize();
     int popCount = 0;

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 10cc87d..65ff404 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -445,8 +445,9 @@ public class TestFileDump {
         .compress(CompressionKind.ZLIB)
         .bufferSize(10000)
         .rowIndexStride(1000)
-        .bloomFilterColumns("l")
-        .bloomFilterFpp(0.01);
+        .bloomFilterColumns("l,s")
+        .bloomFilterFpp(0.01)
+        .bloomFilterVersion(OrcFile.BloomFilterVersion.ORIGINAL);
     VectorizedRowBatch batch = schema.createRowBatch(1000);
     Writer writer = OrcFile.createWriter(testFilePath, options);
     Random r1 = new Random(1);
@@ -483,7 +484,6 @@ public class TestFileDump {
     System.out.flush();
     System.setOut(origOut);
 
-
     checkOutput(outputFilename, workDir + File.separator + outputFilename);
   }
 

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/log4j.properties b/java/tools/src/test/resources/log4j.properties
new file mode 100644
index 0000000..8224baf
--- /dev/null
+++ b/java/tools/src/test/resources/log4j.properties
@@ -0,0 +1,21 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=WARN,stdout
+
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target   = System.err
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
+
+# Suppress the warnings about native io not being available
+log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index 18fd2fb..e23327a 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096
@@ -39,17 +39,17 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 749
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 3 section ROW_INDEX start: 355 length 87
-    Stream: column 3 section BLOOM_FILTER start: 442 length 512
-    Stream: column 1 section DATA start: 954 length 20035
-    Stream: column 2 section DATA start: 20989 length 40050
-    Stream: column 3 section DATA start: 61039 length 3543
-    Stream: column 3 section LENGTH start: 64582 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 64607 length 133
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 310
+    Stream: column 1 section DATA start: 752 length 20035
+    Stream: column 2 section DATA start: 20787 length 40050
+    Stream: column 3 section DATA start: 60837 length 3543
+    Stream: column 3 section LENGTH start: 64380 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 64405 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +67,17 @@ Stripes:
      Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
      Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
      Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944
-    Stream: column 0 section ROW_INDEX start: 64826 length 17
-    Stream: column 1 section ROW_INDEX start: 64843 length 164
-    Stream: column 2 section ROW_INDEX start: 65007 length 168
-    Stream: column 3 section ROW_INDEX start: 65175 length 83
-    Stream: column 3 section BLOOM_FILTER start: 65258 length 512
-    Stream: column 1 section DATA start: 65770 length 20035
-    Stream: column 2 section DATA start: 85805 length 40050
-    Stream: column 3 section DATA start: 125855 length 3532
-    Stream: column 3 section LENGTH start: 129387 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 129412 length 133
+  Stripe: offset: 64624 data: 63775 rows: 5000 tail: 86 index: 742
+    Stream: column 0 section ROW_INDEX start: 64624 length 17
+    Stream: column 1 section ROW_INDEX start: 64641 length 164
+    Stream: column 2 section ROW_INDEX start: 64805 length 168
+    Stream: column 3 section ROW_INDEX start: 64973 length 83
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 65056 length 310
+    Stream: column 1 section DATA start: 65366 length 20035
+    Stream: column 2 section DATA start: 85401 length 40050
+    Stream: column 3 section DATA start: 125451 length 3532
+    Stream: column 3 section LENGTH start: 128983 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 129008 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +95,17 @@ Stripes:
      Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
      Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
      Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950
-    Stream: column 0 section ROW_INDEX start: 129631 length 17
-    Stream: column 1 section ROW_INDEX start: 129648 length 163
-    Stream: column 2 section ROW_INDEX start: 129811 length 168
-    Stream: column 3 section ROW_INDEX start: 129979 length 90
-    Stream: column 3 section BLOOM_FILTER start: 130069 length 512
-    Stream: column 1 section DATA start: 130581 length 20035
-    Stream: column 2 section DATA start: 150616 length 40050
-    Stream: column 3 section DATA start: 190666 length 3544
-    Stream: column 3 section LENGTH start: 194210 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 194235 length 133
+  Stripe: offset: 129227 data: 63787 rows: 5000 tail: 86 index: 748
+    Stream: column 0 section ROW_INDEX start: 129227 length 17
+    Stream: column 1 section ROW_INDEX start: 129244 length 163
+    Stream: column 2 section ROW_INDEX start: 129407 length 168
+    Stream: column 3 section ROW_INDEX start: 129575 length 90
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 129665 length 310
+    Stream: column 1 section DATA start: 129975 length 20035
+    Stream: column 2 section DATA start: 150010 length 40050
+    Stream: column 3 section DATA start: 190060 length 3544
+    Stream: column 3 section LENGTH start: 193604 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 193629 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +123,17 @@ Stripes:
      Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
      Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
      Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952
-    Stream: column 0 section ROW_INDEX start: 194454 length 17
-    Stream: column 1 section ROW_INDEX start: 194471 length 165
-    Stream: column 2 section ROW_INDEX start: 194636 length 167
-    Stream: column 3 section ROW_INDEX start: 194803 length 91
-    Stream: column 3 section BLOOM_FILTER start: 194894 length 512
-    Stream: column 1 section DATA start: 195406 length 20035
-    Stream: column 2 section DATA start: 215441 length 40050
-    Stream: column 3 section DATA start: 255491 length 3574
-    Stream: column 3 section LENGTH start: 259065 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 259090 length 133
+  Stripe: offset: 193848 data: 63817 rows: 5000 tail: 85 index: 750
+    Stream: column 0 section ROW_INDEX start: 193848 length 17
+    Stream: column 1 section ROW_INDEX start: 193865 length 165
+    Stream: column 2 section ROW_INDEX start: 194030 length 167
+    Stream: column 3 section ROW_INDEX start: 194197 length 91
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 194288 length 310
+    Stream: column 1 section DATA start: 194598 length 20035
+    Stream: column 2 section DATA start: 214633 length 40050
+    Stream: column 3 section DATA start: 254683 length 3574
+    Stream: column 3 section LENGTH start: 258257 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 258282 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +151,17 @@ Stripes:
       Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
-  Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432
-    Stream: column 0 section ROW_INDEX start: 259309 length 12
-    Stream: column 1 section ROW_INDEX start: 259321 length 38
-    Stream: column 2 section ROW_INDEX start: 259359 length 41
-    Stream: column 3 section ROW_INDEX start: 259400 length 40
-    Stream: column 3 section BLOOM_FILTER start: 259440 length 301
-    Stream: column 1 section DATA start: 259741 length 4007
-    Stream: column 2 section DATA start: 263748 length 8010
-    Stream: column 3 section DATA start: 271758 length 768
-    Stream: column 3 section LENGTH start: 272526 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 272551 length 133
+  Stripe: offset: 258500 data: 12943 rows: 1000 tail: 78 index: 375
+    Stream: column 0 section ROW_INDEX start: 258500 length 12
+    Stream: column 1 section ROW_INDEX start: 258512 length 38
+    Stream: column 2 section ROW_INDEX start: 258550 length 41
+    Stream: column 3 section ROW_INDEX start: 258591 length 40
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 258631 length 244
+    Stream: column 1 section DATA start: 258875 length 4007
+    Stream: column 2 section DATA start: 262882 length 8010
+    Stream: column 3 section DATA start: 270892 length 768
+    Stream: column 3 section LENGTH start: 271660 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 271685 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +172,7 @@ Stripes:
       Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 
0.022 expectedFpp: 2.343647E-7
       Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 
loadFactor: 0.022 expectedFpp: 2.343647E-7
 
-File length: 273307 bytes
+File length: 272444 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out 
b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index fa5cc2d..8296382 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096
@@ -39,17 +39,20 @@ File Statistics:
   Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761
 
 Stripes:
-  Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974
+  Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14949
     Stream: column 0 section ROW_INDEX start: 3 length 17
     Stream: column 1 section ROW_INDEX start: 20 length 166
     Stream: column 2 section ROW_INDEX start: 186 length 169
     Stream: column 2 section BLOOM_FILTER start: 355 length 6535
-    Stream: column 3 section ROW_INDEX start: 6890 length 87
-    Stream: column 1 section DATA start: 6977 length 20035
-    Stream: column 2 section DATA start: 27012 length 40050
-    Stream: column 3 section DATA start: 67062 length 3543
-    Stream: column 3 section LENGTH start: 70605 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 70630 length 133
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046
+    Stream: column 3 section ROW_INDEX start: 12936 length 87
+    Stream: column 3 section BLOOM_FILTER start: 13023 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 891
+    Stream: column 1 section DATA start: 14952 length 20035
+    Stream: column 2 section DATA start: 34987 length 40050
+    Stream: column 3 section DATA start: 75037 length 3543
+    Stream: column 3 section LENGTH start: 78580 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 78605 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -67,17 +70,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 
0.5178 expectedFpp: 0.009981772
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 
0.5155 expectedFpp: 0.009676614
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 
loadFactor: 0.9736 expectedFpp: 0.829482
-  Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965
-    Stream: column 0 section ROW_INDEX start: 70848 length 17
-    Stream: column 1 section ROW_INDEX start: 70865 length 164
-    Stream: column 2 section ROW_INDEX start: 71029 length 168
-    Stream: column 2 section BLOOM_FILTER start: 71197 length 6533
-    Stream: column 3 section ROW_INDEX start: 77730 length 83
-    Stream: column 1 section DATA start: 77813 length 20035
-    Stream: column 2 section DATA start: 97848 length 40050
-    Stream: column 3 section DATA start: 137898 length 3532
-    Stream: column 3 section LENGTH start: 141430 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 141455 length 133
+  Stripe: offset: 78842 data: 63775 rows: 5000 tail: 103 index: 14940
+    Stream: column 0 section ROW_INDEX start: 78842 length 17
+    Stream: column 1 section ROW_INDEX start: 78859 length 164
+    Stream: column 2 section ROW_INDEX start: 79023 length 168
+    Stream: column 2 section BLOOM_FILTER start: 79191 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 85724 length 6046
+    Stream: column 3 section ROW_INDEX start: 91770 length 83
+    Stream: column 3 section BLOOM_FILTER start: 91853 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 92891 length 891
+    Stream: column 1 section DATA start: 93782 length 20035
+    Stream: column 2 section DATA start: 113817 length 40050
+    Stream: column 3 section DATA start: 153867 length 3532
+    Stream: column 3 section LENGTH start: 157399 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 157424 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -95,17 +101,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 
0.5169 expectedFpp: 0.009855959
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 
0.5173 expectedFpp: 0.009911705
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 
loadFactor: 0.9733 expectedFpp: 0.8276205
-  Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971
-    Stream: column 0 section ROW_INDEX start: 141673 length 17
-    Stream: column 1 section ROW_INDEX start: 141690 length 163
-    Stream: column 2 section ROW_INDEX start: 141853 length 168
-    Stream: column 2 section BLOOM_FILTER start: 142021 length 6533
-    Stream: column 3 section ROW_INDEX start: 148554 length 90
-    Stream: column 1 section DATA start: 148644 length 20035
-    Stream: column 2 section DATA start: 168679 length 40050
-    Stream: column 3 section DATA start: 208729 length 3544
-    Stream: column 3 section LENGTH start: 212273 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 212298 length 133
+  Stripe: offset: 157660 data: 63787 rows: 5000 tail: 104 index: 14946
+    Stream: column 0 section ROW_INDEX start: 157660 length 17
+    Stream: column 1 section ROW_INDEX start: 157677 length 163
+    Stream: column 2 section ROW_INDEX start: 157840 length 168
+    Stream: column 2 section BLOOM_FILTER start: 158008 length 6533
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 164541 length 6046
+    Stream: column 3 section ROW_INDEX start: 170587 length 90
+    Stream: column 3 section BLOOM_FILTER start: 170677 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 171715 length 891
+    Stream: column 1 section DATA start: 172606 length 20035
+    Stream: column 2 section DATA start: 192641 length 40050
+    Stream: column 3 section DATA start: 232691 length 3544
+    Stream: column 3 section LENGTH start: 236235 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 236260 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -123,17 +132,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 
0.5149 expectedFpp: 0.009594797
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 
0.5135 expectedFpp: 0.009419539
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 
loadFactor: 0.9722 expectedFpp: 0.82082444
-  Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964
-    Stream: column 0 section ROW_INDEX start: 212516 length 17
-    Stream: column 1 section ROW_INDEX start: 212533 length 165
-    Stream: column 2 section ROW_INDEX start: 212698 length 167
-    Stream: column 2 section BLOOM_FILTER start: 212865 length 6524
-    Stream: column 3 section ROW_INDEX start: 219389 length 91
-    Stream: column 1 section DATA start: 219480 length 20035
-    Stream: column 2 section DATA start: 239515 length 40050
-    Stream: column 3 section DATA start: 279565 length 3574
-    Stream: column 3 section LENGTH start: 283139 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 283164 length 133
+  Stripe: offset: 236497 data: 63817 rows: 5000 tail: 103 index: 14939
+    Stream: column 0 section ROW_INDEX start: 236497 length 17
+    Stream: column 1 section ROW_INDEX start: 236514 length 165
+    Stream: column 2 section ROW_INDEX start: 236679 length 167
+    Stream: column 2 section BLOOM_FILTER start: 236846 length 6524
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 243370 length 6046
+    Stream: column 3 section ROW_INDEX start: 249416 length 91
+    Stream: column 3 section BLOOM_FILTER start: 249507 length 1038
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 250545 length 891
+    Stream: column 1 section DATA start: 251436 length 20035
+    Stream: column 2 section DATA start: 271471 length 40050
+    Stream: column 3 section DATA start: 311521 length 3574
+    Stream: column 3 section LENGTH start: 315095 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 315120 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -151,17 +163,20 @@ Stripes:
       Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 
0.5147 expectedFpp: 0.009567649
       Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 
0.5201 expectedFpp: 0.010295142
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 
loadFactor: 0.9743 expectedFpp: 0.8332165
-  Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468
-    Stream: column 0 section ROW_INDEX start: 283382 length 12
-    Stream: column 1 section ROW_INDEX start: 283394 length 38
-    Stream: column 2 section ROW_INDEX start: 283432 length 41
-    Stream: column 2 section BLOOM_FILTER start: 283473 length 1337
-    Stream: column 3 section ROW_INDEX start: 284810 length 40
-    Stream: column 1 section DATA start: 284850 length 4007
-    Stream: column 2 section DATA start: 288857 length 8010
-    Stream: column 3 section DATA start: 296867 length 768
-    Stream: column 3 section LENGTH start: 297635 length 25
-    Stream: column 3 section DICTIONARY_DATA start: 297660 length 133
+  Stripe: offset: 315356 data: 12943 rows: 1000 tail: 96 index: 3546
+    Stream: column 0 section ROW_INDEX start: 315356 length 12
+    Stream: column 1 section ROW_INDEX start: 315368 length 38
+    Stream: column 2 section ROW_INDEX start: 315406 length 41
+    Stream: column 2 section BLOOM_FILTER start: 315447 length 1337
+    Stream: column 2 section BLOOM_FILTER_UTF8 start: 316784 length 1211
+    Stream: column 3 section ROW_INDEX start: 317995 length 40
+    Stream: column 3 section BLOOM_FILTER start: 318035 length 472
+    Stream: column 3 section BLOOM_FILTER_UTF8 start: 318507 length 395
+    Stream: column 1 section DATA start: 318902 length 4007
+    Stream: column 2 section DATA start: 322909 length 8010
+    Stream: column 3 section DATA start: 330919 length 768
+    Stream: column 3 section LENGTH start: 331687 length 25
+    Stream: column 3 section DICTIONARY_DATA start: 331712 length 133
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
@@ -172,7 +187,7 @@ Stripes:
       Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 
0.5154 expectedFpp: 0.00966294
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 
loadFactor: 0.5154 expectedFpp: 0.00966294
 
-File length: 298416 bytes
+File length: 332489 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
----------------------------------------------------------------------
diff --git 
a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out 
b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 17a964b..4b0822f 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/orc-file-dump.json
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.json 
b/java/tools/src/test/resources/orc-file-dump.json
index bf654a1..b3e9d12 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1,7 +1,7 @@
 {
   "fileName": "TestFileDump.testDump.orc",
   "fileVersion": "0.12",
-  "writerVersion": "HIVE_13083",
+  "writerVersion": "ORC_101",
   "numberOfRows": 21000,
   "compression": "ZLIB",
   "compressionBufferSize": 4096,
@@ -254,9 +254,9 @@
       "stripeNumber": 1,
       "stripeInformation": {
         "offset": 3,
-        "indexLength": 970,
+        "indexLength": 768,
         "dataLength": 63770,
-        "footerLength": 90,
+        "footerLength": 88,
         "rowCount": 5000
       },
       "streams": [
@@ -286,44 +286,44 @@
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
+          "section": "BLOOM_FILTER_UTF8",
           "startOffset": 461,
-          "length": 512
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 973,
+          "startOffset": 771,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 21008,
+          "startOffset": 20806,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 61058,
+          "startOffset": 60856,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 61075,
+          "startOffset": 60873,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 64585,
+          "startOffset": 64383,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 64610,
+          "startOffset": 64408,
           "length": 133
         }
       ],
@@ -494,77 +494,77 @@
     {
       "stripeNumber": 2,
       "stripeInformation": {
-        "offset": 64833,
-        "indexLength": 961,
+        "offset": 64629,
+        "indexLength": 759,
         "dataLength": 63763,
-        "footerLength": 88,
+        "footerLength": 87,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 64833,
+          "startOffset": 64629,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 64850,
+          "startOffset": 64646,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 65016,
+          "startOffset": 64812,
           "length": 166
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 65182,
+          "startOffset": 64978,
           "length": 100
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 65282,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 65078,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 65794,
+          "startOffset": 65388,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 85829,
+          "startOffset": 85423,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 125879,
+          "startOffset": 125473,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 125896,
+          "startOffset": 125490,
           "length": 3503
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 129399,
+          "startOffset": 128993,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 129424,
+          "startOffset": 129018,
           "length": 133
         }
       ],
@@ -735,77 +735,77 @@
     {
       "stripeNumber": 3,
       "stripeInformation": {
-        "offset": 129645,
-        "indexLength": 962,
+        "offset": 129238,
+        "indexLength": 760,
         "dataLength": 63770,
-        "footerLength": 91,
+        "footerLength": 88,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 129645,
+          "startOffset": 129238,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 129662,
+          "startOffset": 129255,
           "length": 164
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 129826,
+          "startOffset": 129419,
           "length": 167
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 129993,
+          "startOffset": 129586,
           "length": 102
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 130095,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 129688,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 130607,
+          "startOffset": 129998,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 150642,
+          "startOffset": 150033,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 190692,
+          "startOffset": 190083,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 190709,
+          "startOffset": 190100,
           "length": 3510
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 194219,
+          "startOffset": 193610,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 194244,
+          "startOffset": 193635,
           "length": 133
         }
       ],
@@ -976,77 +976,77 @@
     {
       "stripeNumber": 4,
       "stripeInformation": {
-        "offset": 194468,
-        "indexLength": 973,
+        "offset": 193856,
+        "indexLength": 771,
         "dataLength": 63756,
-        "footerLength": 91,
+        "footerLength": 89,
         "rowCount": 5000
       },
       "streams": [
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 194468,
+          "startOffset": 193856,
           "length": 17
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 194485,
+          "startOffset": 193873,
           "length": 166
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 194651,
+          "startOffset": 194039,
           "length": 171
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 194822,
+          "startOffset": 194210,
           "length": 107
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 194929,
-          "length": 512
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 194317,
+          "length": 310
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 195441,
+          "startOffset": 194627,
           "length": 20035
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 215476,
+          "startOffset": 214662,
           "length": 40050
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 255526,
+          "startOffset": 254712,
           "length": 17
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 255543,
+          "startOffset": 254729,
           "length": 3496
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 259039,
+          "startOffset": 258225,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 259064,
+          "startOffset": 258250,
           "length": 133
         }
       ],
@@ -1217,8 +1217,8 @@
     {
       "stripeNumber": 5,
       "stripeInformation": {
-        "offset": 259288,
-        "indexLength": 433,
+        "offset": 258472,
+        "indexLength": 376,
         "dataLength": 12943,
         "footerLength": 83,
         "rowCount": 1000
@@ -1227,67 +1227,67 @@
         {
           "columnId": 0,
           "section": "ROW_INDEX",
-          "startOffset": 259288,
+          "startOffset": 258472,
           "length": 12
         },
         {
           "columnId": 1,
           "section": "ROW_INDEX",
-          "startOffset": 259300,
+          "startOffset": 258484,
           "length": 38
         },
         {
           "columnId": 2,
           "section": "ROW_INDEX",
-          "startOffset": 259338,
+          "startOffset": 258522,
           "length": 41
         },
         {
           "columnId": 3,
           "section": "ROW_INDEX",
-          "startOffset": 259379,
+          "startOffset": 258563,
           "length": 41
         },
         {
           "columnId": 3,
-          "section": "BLOOM_FILTER",
-          "startOffset": 259420,
-          "length": 301
+          "section": "BLOOM_FILTER_UTF8",
+          "startOffset": 258604,
+          "length": 244
         },
         {
           "columnId": 1,
           "section": "DATA",
-          "startOffset": 259721,
+          "startOffset": 258848,
           "length": 4007
         },
         {
           "columnId": 2,
           "section": "DATA",
-          "startOffset": 263728,
+          "startOffset": 262855,
           "length": 8010
         },
         {
           "columnId": 3,
           "section": "PRESENT",
-          "startOffset": 271738,
+          "startOffset": 270865,
           "length": 16
         },
         {
           "columnId": 3,
           "section": "DATA",
-          "startOffset": 271754,
+          "startOffset": 270881,
           "length": 752
         },
         {
           "columnId": 3,
           "section": "LENGTH",
-          "startOffset": 272506,
+          "startOffset": 271633,
           "length": 25
         },
         {
           "columnId": 3,
           "section": "DICTIONARY_DATA",
-          "startOffset": 272531,
+          "startOffset": 271658,
           "length": 133
         }
       ],
@@ -1348,7 +1348,7 @@
       }]
     }
   ],
-  "fileLength": 273300,
+  "fileLength": 272428,
   "paddingLength": 0,
   "paddingRatio": 0,
   "status": "OK"

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/orc-file-dump.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-dump.out 
b/java/tools/src/test/resources/orc-file-dump.out
index 70f7fbd..ae8195e 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 21000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/tools/src/test/resources/orc-file-has-null.out
----------------------------------------------------------------------
diff --git a/java/tools/src/test/resources/orc-file-has-null.out 
b/java/tools/src/test/resources/orc-file-has-null.out
index df075d5..c02f803 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -1,5 +1,5 @@
 Structure for TestFileDump.testDump.orc
-File Version: 0.12 with HIVE_13083
+File Version: 0.12 with ORC_101
 Rows: 20000
 Compression: ZLIB
 Compression size: 4096

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/proto/orc_proto.proto
----------------------------------------------------------------------
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index dbc34ab..de6974e 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -91,6 +91,7 @@ message RowIndex {
 message BloomFilter {
   optional uint32 numHashFunctions = 1;
   repeated fixed64 bitset = 2;
+  optional bytes utf8bitset = 3;
 }
 
 message BloomFilterIndex {
@@ -109,6 +110,7 @@ message Stream {
     SECONDARY = 5;
     ROW_INDEX = 6;
     BLOOM_FILTER = 7;
+    BLOOM_FILTER_UTF8 = 8;
   }
   optional Kind kind = 1;
   optional uint32 column = 2;
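
The dump outputs above show a writer emitting both stream kinds side by
side during the transition. As a minimal sketch of how a reader could
cope with that — Stream, Kind, and pickBloomFilter here are illustrative
stand-ins, not ORC's actual reader API — prefer the new
BLOOM_FILTER_UTF8 stream and fall back to the old one:

```
import java.util.List;

final class BloomFilterStreamPicker {
  enum Kind { DATA, ROW_INDEX, BLOOM_FILTER, BLOOM_FILTER_UTF8 }

  static final class Stream {
    final Kind kind;
    final int column;
    Stream(Kind kind, int column) { this.kind = kind; this.column = column; }
  }

  /** Prefer the post-ORC-101 utf8 bloom filter; fall back to the old one. */
  static Stream pickBloomFilter(List<Stream> streams, int column) {
    Stream fallback = null;
    for (Stream s : streams) {
      if (s.column != column) {
        continue;
      }
      if (s.kind == Kind.BLOOM_FILTER_UTF8) {
        return s;                     // consistent utf8 encoding (ORC-101)
      }
      if (s.kind == Kind.BLOOM_FILTER) {
        fallback = s;                 // pre-ORC-101 encoding
      }
    }
    return fallback;
  }
}
```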

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/site/_data/releases.yml
----------------------------------------------------------------------
diff --git a/site/_data/releases.yml b/site/_data/releases.yml
index 3331688..1282115 100644
--- a/site/_data/releases.yml
+++ b/site/_data/releases.yml
@@ -9,6 +9,7 @@
   sha256: 5c394c7ed3a31d20726ded55ed9c5a0eeff1bd5b85b1cb2ee6c3c1a94560578c
   known-issues:
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use an inconsistent encoding.
 
 1.1.2:
   date: 2016-07-08
@@ -19,6 +20,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use an inconsistent encoding.
 
 1.1.1:
   date: 2016-06-13
@@ -29,6 +31,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use an inconsistent encoding.
 
 1.1.0:
   date: 2016-06-10
@@ -39,6 +42,7 @@
   known-issues:
     HIVE-14214: Schema evolution and predicate pushdown don't work together.
     ORC-40: Predicate push down is not implemented in C++.
+    ORC-101: Bloom filters for string and decimal use an inconsistent encoding.
 
 1.0.0:
   date: 2016-01-25

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/site/_docs/spec-index.md
----------------------------------------------------------------------
diff --git a/site/_docs/spec-index.md b/site/_docs/spec-index.md
index 009df59..263c9a8 100644
--- a/site/_docs/spec-index.md
+++ b/site/_docs/spec-index.md
@@ -57,14 +57,17 @@ group (default to 10,000 rows) in a column. Only the row 
groups that
 satisfy min/max row index evaluation will be evaluated against the
 bloom filter index.
 
-Each BloomFilterEntry stores the number of hash functions ('k') used and
-the bitset backing the bloom filter. The bitset is serialized as repeated
-longs from which the number of bits ('m') for the bloom filter can be derived.
-m = bitset.length * 64.
+Each BloomFilterEntry stores the number of hash functions ('k') used
+and the bitset backing the bloom filter. The original encoding (pre
+ORC-101) stored the bitset as a repeated sequence of longs in the
+bitset field with a little endian bit order (0x1 is bit 0 and 0x2 is
+bit 1). After ORC-101, the bitset is stored as a sequence of bytes,
+again with a little endian bit order, in the utf8bitset field.
 
 ```message BloomFilter {
  optional uint32 numHashFunctions = 1;
  repeated fixed64 bitset = 2;
+ optional bytes utf8bitset = 3;
 }
 ```
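
To make the layout described above concrete, here is a hedged sketch of
the equivalence between the two fields: bit i lives in long i/64 of the
old bitset field and in byte i/8 of the new utf8bitset field, little
endian in both. The class and method names are hypothetical, not part
of the ORC API:

```
final class BloomFilterBits {
  /** Re-serializes a pre-ORC-101 long[] bitset in the utf8bitset byte layout. */
  static byte[] toUtf8Bitset(long[] bitset) {
    byte[] out = new byte[bitset.length * 8];
    for (int i = 0; i < bitset.length; i++) {
      for (int b = 0; b < 8; b++) {
        out[i * 8 + b] = (byte) (bitset[i] >>> (8 * b)); // least significant byte first
      }
    }
    return out;
  }

  /** Tests bit i in the old long layout. */
  static boolean testBit(long[] bitset, int i) {
    return (bitset[i >>> 6] & (1L << (i & 63))) != 0;
  }

  /** Tests bit i in the new byte layout; agrees with testBit above. */
  static boolean testBit(byte[] bytes, int i) {
    return (bytes[i >>> 3] & (1 << (i & 7))) != 0;
  }
}
```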
 

http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/site/_docs/stripes.md
----------------------------------------------------------------------
diff --git a/site/_docs/stripes.md b/site/_docs/stripes.md
index d53f709..cc85feb 100644
--- a/site/_docs/stripes.md
+++ b/site/_docs/stripes.md
@@ -56,6 +56,10 @@ depends on the type and encoding of the column.
  SECONDARY = 5;
  // the index for seeking to particular row groups
  ROW_INDEX = 6;
+ // original bloom filters used before ORC-101
+ BLOOM_FILTER = 7;
+ // bloom filters that consistently use utf8
+ BLOOM_FILTER_UTF8 = 8;
  }
  required Kind kind = 1;
  // the column id
