ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.
Fixes #60 Signed-off-by: Owen O'Malley <omal...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/833cc0e7 Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/833cc0e7 Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/833cc0e7 Branch: refs/heads/master Commit: 833cc0e73a301bf1aab4ae310e72318ed586efcb Parents: 7118e96 Author: Owen O'Malley <omal...@apache.org> Authored: Tue Sep 13 13:28:44 2016 -0700 Committer: Owen O'Malley <omal...@apache.org> Committed: Thu Sep 22 14:25:27 2016 -0500 ---------------------------------------------------------------------- c++/include/orc/Reader.hh | 1 + c++/src/Reader.cc | 2 + .../src/java/org/apache/orc/BloomFilterIO.java | 50 -- .../src/java/org/apache/orc/DataReader.java | 7 +- java/core/src/java/org/apache/orc/OrcConf.java | 10 + java/core/src/java/org/apache/orc/OrcFile.java | 51 +- .../java/org/apache/orc/TypeDescription.java | 26 + .../orc/impl/ConvertTreeReaderFactory.java | 12 +- .../src/java/org/apache/orc/impl/OrcIndex.java | 10 +- .../org/apache/orc/impl/RecordReaderImpl.java | 70 ++- .../org/apache/orc/impl/RecordReaderUtils.java | 192 ++++++-- .../org/apache/orc/impl/SchemaEvolution.java | 4 + .../java/org/apache/orc/impl/StreamName.java | 1 + .../java/org/apache/orc/impl/WriterImpl.java | 228 +++++++-- .../java/org/apache/orc/util/BloomFilter.java | 328 +++++++++++++ .../java/org/apache/orc/util/BloomFilterIO.java | 95 ++++ .../org/apache/orc/util/BloomFilterUtf8.java | 55 +++ .../test/org/apache/orc/TestVectorOrcFile.java | 4 +- .../apache/orc/impl/TestRecordReaderImpl.java | 484 +++++++++++++------ .../org/apache/orc/util/TestBloomFilter.java | 92 ++++ .../test/org/apache/orc/util/TestMurmur3.java | 225 +++++++++ java/core/src/test/resources/log4j.properties | 3 + .../src/test/resources/log4j.properties | 3 + .../apache/hive/common/util/BloomFilter.java | 313 ------------ .../org/apache/hive/common/util/Murmur3.java | 335 
------------- .../src/java/org/apache/orc/util/Murmur3.java | 335 +++++++++++++ .../apache/hive/common/util/TestMurmur3.java | 224 --------- .../src/java/org/apache/orc/tools/FileDump.java | 20 +- .../java/org/apache/orc/tools/JsonFileDump.java | 27 +- .../test/org/apache/orc/tools/TestFileDump.java | 6 +- java/tools/src/test/resources/log4j.properties | 21 + .../resources/orc-file-dump-bloomfilter.out | 106 ++-- .../resources/orc-file-dump-bloomfilter2.out | 121 +++-- .../orc-file-dump-dictionary-threshold.out | 2 +- .../tools/src/test/resources/orc-file-dump.json | 150 +++--- java/tools/src/test/resources/orc-file-dump.out | 2 +- .../src/test/resources/orc-file-has-null.out | 2 +- proto/orc_proto.proto | 2 + site/_data/releases.yml | 4 + site/_docs/spec-index.md | 11 +- site/_docs/stripes.md | 4 + 41 files changed, 2240 insertions(+), 1398 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/c++/include/orc/Reader.hh ---------------------------------------------------------------------- diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 25a0a17..eacbd80 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -53,6 +53,7 @@ namespace orc { WriterVersion_HIVE_4243 = 2, WriterVersion_HIVE_12055 = 3, WriterVersion_HIVE_13083 = 4, + WriterVersion_ORC_101 = 5, WriterVersion_MAX = INT64_MAX }; http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/c++/src/Reader.cc ---------------------------------------------------------------------- diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 9b1f1b9..91f4ea1 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -72,6 +72,8 @@ namespace orc { return "HIVE-12055"; case WriterVersion_HIVE_13083: return "HIVE-13083"; + case WriterVersion_ORC_101: + return "ORC-101"; } std::stringstream buffer; buffer << "future - " << version; 
http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/BloomFilterIO.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/BloomFilterIO.java b/java/core/src/java/org/apache/orc/BloomFilterIO.java deleted file mode 100644 index 106227d..0000000 --- a/java/core/src/java/org/apache/orc/BloomFilterIO.java +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.orc; - -import org.apache.hive.common.util.BloomFilter; - -public class BloomFilterIO extends BloomFilter { - - public BloomFilterIO(long expectedEntries) { - super(expectedEntries, DEFAULT_FPP); - } - - public BloomFilterIO(long expectedEntries, double fpp) { - super(expectedEntries, fpp); - } - - static long[] toArray(OrcProto.BloomFilter filter) { - long[] result = new long[filter.getBitsetCount()]; - int i =0; - for(Long l: filter.getBitsetList()) { - result[i++] = l; - } - return result; - } - -/** - * Initializes the BloomFilter from the given Orc BloomFilter - */ - public BloomFilterIO(OrcProto.BloomFilter bloomFilter) { - this.bitSet = new BitSet(toArray(bloomFilter)); - this.numHashFunctions = bloomFilter.getNumHashFunctions(); - this.numBits = (int) this.bitSet.bitSize(); - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/DataReader.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/DataReader.java b/java/core/src/java/org/apache/orc/DataReader.java index a5dbb76..b3f91f2 100644 --- a/java/core/src/java/org/apache/orc/DataReader.java +++ b/java/core/src/java/org/apache/orc/DataReader.java @@ -31,9 +31,14 @@ public interface DataReader extends AutoCloseable { void open() throws IOException; OrcIndex readRowIndex(StripeInformation stripe, + TypeDescription fileSchema, OrcProto.StripeFooter footer, - boolean[] included, OrcProto.RowIndex[] indexes, + boolean ignoreNonUtf8BloomFilter, + boolean[] included, + OrcProto.RowIndex[] indexes, boolean[] sargColumns, + OrcFile.WriterVersion version, + OrcProto.Stream.Kind[] bloomFilterKinds, OrcProto.BloomFilterIndex[] bloomFilterIndices ) throws IOException; http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/OrcConf.java ---------------------------------------------------------------------- diff --git 
a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java index ac8e3f0..05ab13b 100644 --- a/java/core/src/java/org/apache/orc/OrcConf.java +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -105,6 +105,16 @@ public enum OrcConf { "dictionary or not will be retained thereafter."), BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns", "", "List of columns to create bloom filters for when writing."), + BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version", + "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(), + "Which version of the bloom filters should we write.\n" + + "The choices are:\n" + + " original - writes two versions of the bloom filters for use by\n" + + " both old and new readers.\n" + + " utf8 - writes just the new bloom filters."), + IGNORE_NON_UTF8_BLOOM_FILTERS("orc.bloom.filter.ignore.non-utf8", + "orc.bloom.filter.ignore.non-utf8", false, + "Should the reader ignore the obsolete non-UTF8 bloom filters."), MAX_FILE_LENGTH("orc.max.file.length", "orc.max.file.length", Long.MAX_VALUE, "The maximum size of the file to read for finding the file tail. 
This\n" + "is primarily used for streaming ingest to read intermediate\n" + http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/OrcFile.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index ddfa9f7..6b2d48e 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -108,6 +108,7 @@ public class OrcFile { HIVE_4243(2), // use real column names from Hive tables HIVE_12055(3), // vectorized writer HIVE_13083(4), // decimal writer updating present stream wrongly + ORC_101(5), // bloom filters use utf8 // Don't use any magic numbers here except for the below: FUTURE(Integer.MAX_VALUE); // a version from a future writer @@ -144,8 +145,12 @@ public class OrcFile { if (val == FUTURE.id) return FUTURE; // Special handling for the magic value. return values[val]; } + + public boolean includes(WriterVersion other) { + return id >= other.id; + } } - public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_13083; + public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_101; public enum EncodingStrategy { SPEED, COMPRESSION @@ -231,6 +236,33 @@ public class OrcFile { void preFooterWrite(WriterContext context) throws IOException; } + public static enum BloomFilterVersion { + // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support + // both old and new readers. + ORIGINAL("original"), + // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8. 
+ // See ORC-101 + UTF8("utf8"); + + private final String id; + private BloomFilterVersion(String id) { + this.id = id; + } + + public String toString() { + return id; + } + + public static BloomFilterVersion fromString(String s) { + for (BloomFilterVersion version: values()) { + if (version.id.equals(s)) { + return version; + } + } + throw new IllegalArgumentException("Unknown BloomFilterVersion " + s); + } + } + /** * Options for creating ORC file writers. */ @@ -253,6 +285,7 @@ public class OrcFile { private double paddingTolerance; private String bloomFilterColumns; private double bloomFilterFpp; + private BloomFilterVersion bloomFilterVersion; protected WriterOptions(Properties tableProperties, Configuration conf) { configuration = conf; @@ -286,6 +319,10 @@ public class OrcFile { conf); bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties, conf); + bloomFilterVersion = + BloomFilterVersion.fromString( + OrcConf.BLOOM_FILTER_WRITE_VERSION.getString(tableProperties, + conf)); } /** @@ -430,6 +467,14 @@ public class OrcFile { } /** + * Set the version of the bloom filters to write. + */ + public WriterOptions bloomFilterVersion(BloomFilterVersion version) { + this.bloomFilterVersion = version; + return this; + } + + /** * A package local option to set the memory manager. 
*/ protected WriterOptions memory(MemoryManager value) { @@ -508,6 +553,10 @@ public class OrcFile { public double getBloomFilterFpp() { return bloomFilterFpp; } + + public BloomFilterVersion getBloomFilterVersion() { + return bloomFilterVersion; + } } /** http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/TypeDescription.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index da9fe49..bc6787d 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -842,4 +842,30 @@ public class TypeDescription printJsonToBuffer("", buffer, 0); return buffer.toString(); } + + /** + * Locate a subtype by its id. + * @param goal the column id to look for + * @return the subtype + */ + public TypeDescription findSubtype(int goal) { + // call getId method to make sure the ids are assigned + int id = getId(); + if (goal < id || goal > maxId) { + throw new IllegalArgumentException("Unknown type id " + id + " in " + + toJson()); + } + if (goal == id) { + return this; + } else { + TypeDescription prev = null; + for(TypeDescription next: children) { + if (next.id > goal) { + return prev.findSubtype(goal); + } + prev = next; + } + return prev.findSubtype(goal); + } + } } http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java index 36b9a20..20e0faa 100644 --- a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java @@ -1408,7 +1408,7 @@ public class 
ConvertTreeReaderFactory extends TreeReaderFactory { public void setConvertVectorElement(int elementNum) { long longValue = longColVector.vector[elementNum]; String string = anyIntegerAsLongTreeReader.getString(longValue); - byte[] bytes = string.getBytes(); + byte[] bytes = string.getBytes(StandardCharsets.UTF_8); assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); } @@ -1450,7 +1450,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory { float floatValue = (float) doubleColVector.vector[elementNum]; if (!Float.isNaN(floatValue)) { String string = String.valueOf(floatValue); - byte[] bytes = string.getBytes(); + byte[] bytes = string.getBytes(StandardCharsets.UTF_8); assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); } else { bytesColVector.noNulls = false; @@ -1495,7 +1495,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory { double doubleValue = doubleColVector.vector[elementNum]; if (!Double.isNaN(doubleValue)) { String string = String.valueOf(doubleValue); - byte[] bytes = string.getBytes(); + byte[] bytes = string.getBytes(StandardCharsets.UTF_8); assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); } else { bytesColVector.noNulls = false; @@ -1544,7 +1544,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory { @Override public void setConvertVectorElement(int elementNum) { String string = decimalColVector.vector[elementNum].getHiveDecimal().toString(); - byte[] bytes = string.getBytes(); + byte[] bytes = string.getBytes(StandardCharsets.UTF_8); assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); } @@ -1584,7 +1584,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory { public void setConvertVectorElement(int elementNum) throws IOException { String string = timestampColVector.asScratchTimestamp(elementNum).toString(); - byte[] bytes = string.getBytes(); + byte[] bytes = 
string.getBytes(StandardCharsets.UTF_8); assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); } @@ -1626,7 +1626,7 @@ public class ConvertTreeReaderFactory extends TreeReaderFactory { public void setConvertVectorElement(int elementNum) throws IOException { date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum])); String string = date.toString(); - byte[] bytes = string.getBytes(); + byte[] bytes = string.getBytes(StandardCharsets.UTF_8); assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes); } http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/OrcIndex.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/OrcIndex.java b/java/core/src/java/org/apache/orc/impl/OrcIndex.java index 50a15f2..edcb3ba 100644 --- a/java/core/src/java/org/apache/orc/impl/OrcIndex.java +++ b/java/core/src/java/org/apache/orc/impl/OrcIndex.java @@ -22,10 +22,14 @@ import org.apache.orc.OrcProto; public final class OrcIndex { OrcProto.RowIndex[] rowGroupIndex; + OrcProto.Stream.Kind[] bloomFilterKinds; OrcProto.BloomFilterIndex[] bloomFilterIndex; - public OrcIndex(OrcProto.RowIndex[] rgIndex, OrcProto.BloomFilterIndex[] bfIndex) { + public OrcIndex(OrcProto.RowIndex[] rgIndex, + OrcProto.Stream.Kind[] bloomFilterKinds, + OrcProto.BloomFilterIndex[] bfIndex) { this.rowGroupIndex = rgIndex; + this.bloomFilterKinds = bloomFilterKinds; this.bloomFilterIndex = bfIndex; } @@ -37,6 +41,10 @@ public final class OrcIndex { return bloomFilterIndex; } + public OrcProto.Stream.Kind[] getBloomFilterKinds() { + return bloomFilterKinds; + } + public void setRowGroupIndex(OrcProto.RowIndex[] rowGroupIndex) { this.rowGroupIndex = rowGroupIndex; } http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java 
---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index e8ad54d..c7ce2bb 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -27,7 +27,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.apache.orc.BloomFilterIO; +import org.apache.orc.OrcFile; +import org.apache.orc.util.BloomFilter; +import org.apache.orc.util.BloomFilterIO; import org.apache.orc.BooleanColumnStatistics; import org.apache.orc.ColumnStatistics; import org.apache.orc.CompressionCodec; @@ -88,10 +90,13 @@ public class RecordReaderImpl implements RecordReader { private final TreeReaderFactory.TreeReader reader; private final OrcProto.RowIndex[] indexes; private final OrcProto.BloomFilterIndex[] bloomFilterIndices; + private final OrcProto.Stream.Kind[] bloomFilterKind; private final SargApplier sargApp; // an array about which row groups aren't skipped private boolean[] includedRowGroups = null; private final DataReader dataReader; + private final boolean ignoreNonUtf8BloomFilter; + private final OrcFile.WriterVersion writerVersion; /** * Given a list of column names, find the given column and return the index. 
@@ -134,6 +139,7 @@ public class RecordReaderImpl implements RecordReader { protected RecordReaderImpl(ReaderImpl fileReader, Reader.Options options) throws IOException { this.included = options.getInclude(); + this.writerVersion = fileReader.getWriterVersion(); included[0] = true; if (options.getSchema() == null) { if (LOG.isInfoEnabled()) { @@ -162,11 +168,14 @@ public class RecordReaderImpl implements RecordReader { this.types = fileReader.types; this.bufferSize = fileReader.bufferSize; this.rowIndexStride = fileReader.rowIndexStride; + this.ignoreNonUtf8BloomFilter = + OrcConf.IGNORE_NON_UTF8_BLOOM_FILTERS.getBoolean(fileReader.conf); SearchArgument sarg = options.getSearchArgument(); if (sarg != null && rowIndexStride != 0) { sargApp = new SargApplier(sarg, options.getColumnNames(), rowIndexStride, - included.length, evolution); + included.length, evolution, + writerVersion); } else { sargApp = null; } @@ -218,6 +227,7 @@ public class RecordReaderImpl implements RecordReader { writerIncluded = evolution.getFileIncluded(); indexes = new OrcProto.RowIndex[types.size()]; bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()]; + bloomFilterKind = new OrcProto.Stream.Kind[types.size()]; advanceToNextRow(reader, 0L, true); } @@ -339,20 +349,23 @@ public class RecordReaderImpl implements RecordReader { * that is referenced in the predicate. * @param statsProto the statistics for the column mentioned in the predicate * @param predicate the leaf predicate we need to evaluation - * @param bloomFilter + * @param bloomFilter the bloom filter + * @param writerVersion the version of software that wrote the file + * @param type what is the kind of this column * @return the set of truth values that may be returned for the given * predicate. 
*/ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto, - PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) { + PredicateLeaf predicate, + OrcProto.Stream.Kind kind, + OrcProto.BloomFilter bloomFilter, + OrcFile.WriterVersion writerVersion, + TypeDescription.Category type) { ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto); Object minValue = getMin(cs); Object maxValue = getMax(cs); - BloomFilterIO bf = null; - if (bloomFilter != null) { - bf = new BloomFilterIO(bloomFilter); - } - return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf); + return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), + BloomFilterIO.deserialize(kind, writerVersion, type, bloomFilter)); } /** @@ -365,14 +378,14 @@ public class RecordReaderImpl implements RecordReader { */ public static TruthValue evaluatePredicate(ColumnStatistics stats, PredicateLeaf predicate, - BloomFilterIO bloomFilter) { + BloomFilter bloomFilter) { Object minValue = getMin(stats); Object maxValue = getMax(stats); return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter); } static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min, - Object max, boolean hasNull, BloomFilterIO bloomFilter) { + Object max, boolean hasNull, BloomFilter bloomFilter) { // if we didn't have any values, everything must have been null if (min == null) { if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) { @@ -421,7 +434,7 @@ public class RecordReaderImpl implements RecordReader { } private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate, - TruthValue result, BloomFilterIO bloomFilter) { + TruthValue result, BloomFilter bloomFilter) { // evaluate bloom filter only when // 1) Bloom filter is available // 2) Min/Max evaluation yield YES or MAYBE @@ -531,7 +544,7 @@ public class RecordReaderImpl implements RecordReader { } private static TruthValue 
evaluatePredicateBloomFilter(PredicateLeaf predicate, - final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) { + final Object predObj, BloomFilter bloomFilter, boolean hasNull) { switch (predicate.getOperator()) { case NULL_SAFE_EQUALS: // null safe equals does not return *_NULL variant. So set hasNull to false @@ -553,7 +566,7 @@ public class RecordReaderImpl implements RecordReader { } } - private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) { + private static TruthValue checkInBloomFilter(BloomFilter bf, Object predObj, boolean hasNull) { TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO; if (predObj instanceof Long) { @@ -708,6 +721,7 @@ public class RecordReaderImpl implements RecordReader { public final static boolean[] READ_ALL_RGS = null; public final static boolean[] READ_NO_RGS = new boolean[0]; + private final OrcFile.WriterVersion writerVersion; private final SearchArgument sarg; private final List<PredicateLeaf> sargLeaves; private final int[] filterColumns; @@ -716,10 +730,13 @@ public class RecordReaderImpl implements RecordReader { private final boolean[] sargColumns; private SchemaEvolution evolution; - public SargApplier(SearchArgument sarg, String[] columnNames, + public SargApplier(SearchArgument sarg, + String[] columnNames, long rowIndexStride, int includedCount, - SchemaEvolution evolution) { + SchemaEvolution evolution, + OrcFile.WriterVersion writerVersion) { + this.writerVersion = writerVersion; this.sarg = sarg; sargLeaves = sarg.getLeaves(); filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, @@ -745,8 +762,11 @@ public class RecordReaderImpl implements RecordReader { * row groups must be read. 
* @throws IOException */ - public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes, - OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException { + public boolean[] pickRowGroups(StripeInformation stripe, + OrcProto.RowIndex[] indexes, + OrcProto.Stream.Kind[] bloomFilterKinds, + OrcProto.BloomFilterIndex[] bloomFilterIndices, + boolean returnNone) throws IOException { long rowsInStripe = stripe.getNumberOfRows(); int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride); boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc? @@ -765,11 +785,15 @@ public class RecordReaderImpl implements RecordReader { } OrcProto.ColumnStatistics stats = entry.getStatistics(); OrcProto.BloomFilter bf = null; + OrcProto.Stream.Kind bfk = null; if (bloomFilterIndices != null && bloomFilterIndices[columnIx] != null) { + bfk = bloomFilterKinds[columnIx]; bf = bloomFilterIndices[columnIx].getBloomFilter(rowGroup); } if (evolution != null && evolution.isPPDSafeConversion(columnIx)) { - leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf); + leafValues[pred] = evaluatePredicateProto(stats, + sargLeaves.get(pred), bfk, bf, writerVersion, + evolution.getFileSchema().findSubtype(columnIx).getCategory()); } else { leafValues[pred] = TruthValue.YES_NO_NULL; } @@ -809,7 +833,8 @@ public class RecordReaderImpl implements RecordReader { return null; } readRowIndex(currentStripe, writerIncluded, sargApp.sargColumns); - return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false); + return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, + bloomFilterKind, bloomFilterIndices, false); } private void clearStreams() { @@ -1168,8 +1193,9 @@ public class RecordReaderImpl implements RecordReader { sargColumns = sargColumns == null ? (sargApp == null ? 
null : sargApp.sargColumns) : sargColumns; } - return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns, - bloomFilterIndex); + return dataReader.readRowIndex(stripe, evolution.getFileType(0), stripeFooter, + ignoreNonUtf8BloomFilter, included, indexes, sargColumns, writerVersion, + bloomFilterKind, bloomFilterIndex); } private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry) http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java index 3d57732..cadee35 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java @@ -30,13 +30,13 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.hive.common.io.DiskRangeList; -import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper; -import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper; import org.apache.orc.CompressionCodec; import org.apache.orc.DataReader; +import org.apache.orc.OrcFile; import org.apache.orc.OrcProto; import org.apache.orc.StripeInformation; +import org.apache.orc.TypeDescription; /** * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl. 
@@ -44,6 +44,100 @@ import org.apache.orc.StripeInformation; public class RecordReaderUtils { private static final HadoopShims SHIMS = HadoopShims.Factory.get(); + static boolean hadBadBloomFilters(TypeDescription.Category category, + OrcFile.WriterVersion version) { + switch(category) { + case STRING: + case CHAR: + case VARCHAR: + return !version.includes(OrcFile.WriterVersion.HIVE_12055); + case DECIMAL: + return true; + default: + return false; + } + } + + /** + * Plans the list of disk ranges that the given stripe needs to read the + * indexes. All of the positions are relative to the start of the stripe. + * @param fileSchema the schema for the file + * @param footer the stripe footer + * @param ignoreNonUtf8BloomFilter should the reader ignore non-utf8 + * encoded bloom filters + * @param fileIncluded the columns (indexed by file columns) that should be + * read + * @param sargColumns true for the columns (indexed by file columns) that + * we need bloom filters for + * @param version the version of the software that wrote the file + * @param bloomFilterKinds (output) the stream kind of the bloom filters + * @return a list of merged disk ranges to read + */ + static DiskRangeList planIndexReading(TypeDescription fileSchema, + OrcProto.StripeFooter footer, + boolean ignoreNonUtf8BloomFilter, + boolean[] fileIncluded, + boolean[] sargColumns, + OrcFile.WriterVersion version, + OrcProto.Stream.Kind[] bloomFilterKinds) { + DiskRangeList.CreateHelper result = new DiskRangeList.CreateHelper(); + List<OrcProto.Stream> streams = footer.getStreamsList(); + // figure out which kind of bloom filter we want for each column + // picks bloom_filter_utf8 if its available, otherwise bloom_filter + if (sargColumns != null) { + for (OrcProto.Stream stream : streams) { + if (stream.hasKind() && stream.hasColumn()) { + int column = stream.getColumn(); + if (sargColumns[column]) { + switch (stream.getKind()) { + case BLOOM_FILTER: + if (bloomFilterKinds[column] == null && + 
!(ignoreNonUtf8BloomFilter && + hadBadBloomFilters(fileSchema.findSubtype(column) + .getCategory(), version))) { + bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER; + } + break; + case BLOOM_FILTER_UTF8: + bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER_UTF8; + break; + default: + break; + } + } + } + } + } + long offset = 0; + for(OrcProto.Stream stream: footer.getStreamsList()) { + if (stream.hasKind() && stream.hasColumn()) { + int column = stream.getColumn(); + if (fileIncluded == null || fileIncluded[column]) { + boolean needStream = false; + switch (stream.getKind()) { + case ROW_INDEX: + needStream = true; + break; + case BLOOM_FILTER: + needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER; + break; + case BLOOM_FILTER_UTF8: + needStream = bloomFilterKinds[column] == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8; + break; + default: + // PASS + break; + } + if (needStream) { + result.addOrMerge(offset, offset + stream.getLength(), true, false); + } + } + } + offset += stream.getLength(); + } + return result.get(); + } + private static class DefaultDataReader implements DataReader { private FSDataInputStream file = null; private final ByteBufferAllocatorPool pool; @@ -91,10 +185,14 @@ public class RecordReaderUtils { @Override public OrcIndex readRowIndex(StripeInformation stripe, + TypeDescription fileSchema, OrcProto.StripeFooter footer, + boolean ignoreNonUtf8BloomFilter, boolean[] included, OrcProto.RowIndex[] indexes, boolean[] sargColumns, + OrcFile.WriterVersion version, + OrcProto.Stream.Kind[] bloomFilterKinds, OrcProto.BloomFilterIndex[] bloomFilterIndices ) throws IOException { if (file == null) { @@ -106,49 +204,61 @@ public class RecordReaderUtils { if (indexes == null) { indexes = new OrcProto.RowIndex[typeCount]; } + if (bloomFilterKinds == null) { + bloomFilterKinds = new OrcProto.Stream.Kind[typeCount]; + } if (bloomFilterIndices == null) { bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount]; 
} - long offset = stripe.getOffset(); - List<OrcProto.Stream> streams = footer.getStreamsList(); - for (int i = 0; i < streams.size(); i++) { - OrcProto.Stream stream = streams.get(i); - OrcProto.Stream nextStream = null; - if (i < streams.size() - 1) { - nextStream = streams.get(i+1); + DiskRangeList ranges = planIndexReading(fileSchema, footer, + ignoreNonUtf8BloomFilter, included, sargColumns, version, + bloomFilterKinds); + ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false); + long offset = 0; + DiskRangeList range = ranges; + for(OrcProto.Stream stream: footer.getStreamsList()) { + // advance to find the next range + while (range != null && range.getEnd() <= offset) { + range = range.next; } - int col = stream.getColumn(); - int len = (int) stream.getLength(); - // row index stream and bloom filter are interlaced, check if the sarg column contains bloom - // filter and combine the io to read row index and bloom filters for that column together - if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) { - boolean readBloomFilter = false; - if (sargColumns != null && sargColumns[col] && - nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) { - len += nextStream.getLength(); - i += 1; - readBloomFilter = true; - } - if ((included == null || included[col]) && indexes[col] == null) { - byte[] buffer = new byte[len]; - file.readFully(offset, buffer, 0, buffer.length); - ByteBuffer bb = ByteBuffer.wrap(buffer); - indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index", - ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(), - codec, bufferSize)); - if (readBloomFilter) { - bb.position((int) stream.getLength()); - bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create( - "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)), - nextStream.getLength(), codec, bufferSize)); - } + // no more ranges, so we are done + if (range == null) { + break; + } + int column = 
stream.getColumn(); + if (stream.hasKind() && range.getOffset() <= offset) { + switch (stream.getKind()) { + case ROW_INDEX: + if (included == null || included[column]) { + ByteBuffer bb = range.getData().duplicate(); + bb.position((int) (offset - range.getOffset())); + bb.limit((int) (bb.position() + stream.getLength())); + indexes[column] = OrcProto.RowIndex.parseFrom( + InStream.createCodedInputStream("index", + ReaderImpl.singleton(new BufferChunk(bb, 0)), + stream.getLength(), + codec, bufferSize)); + } + break; + case BLOOM_FILTER: + case BLOOM_FILTER_UTF8: + if (sargColumns != null && sargColumns[column]) { + ByteBuffer bb = range.getData().duplicate(); + bb.position((int) (offset - range.getOffset())); + bb.limit((int) (bb.position() + stream.getLength())); + bloomFilterIndices[column] = OrcProto.BloomFilterIndex.parseFrom + (InStream.createCodedInputStream("bloom_filter", + ReaderImpl.singleton(new BufferChunk(bb, 0)), + stream.getLength(), codec, bufferSize)); + } + break; + default: + break; } } - offset += len; + offset += stream.getLength(); } - - OrcIndex index = new OrcIndex(indexes, bloomFilterIndices); - return index; + return new OrcIndex(indexes, bloomFilterKinds, bloomFilterIndices); } @Override @@ -234,14 +344,14 @@ public class RecordReaderUtils { } public static void addEntireStreamToRanges( - long offset, long length, CreateHelper list, boolean doMergeBuffers) { + long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) { list.addOrMerge(offset, offset + length, doMergeBuffers, false); } public static void addRgFilteredStreamToRanges(OrcProto.Stream stream, boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index, OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull, - long offset, long length, CreateHelper list, boolean doMergeBuffers) { + long offset, long length, DiskRangeList.CreateHelper list, boolean doMergeBuffers) { for (int group = 0; group < 
includedRowGroups.length; ++group) { if (!includedRowGroups[group]) continue; int posn = getIndexPosition( @@ -399,7 +509,7 @@ public class RecordReaderUtils { if (range == null) return null; DiskRangeList prev = range.prev; if (prev == null) { - prev = new MutateHelper(range); + prev = new DiskRangeList.MutateHelper(range); } while (range != null) { if (range.hasData()) { http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index 1e11728..20adfd8 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -153,6 +153,10 @@ public class SchemaEvolution { return hasConversion; } + public TypeDescription getFileSchema() { + return fileSchema; + } + public TypeDescription getFileType(TypeDescription readerType) { return getFileType(readerType.getId()); } http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/StreamName.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/StreamName.java b/java/core/src/java/org/apache/orc/impl/StreamName.java index b3fd145..e3561bf 100644 --- a/java/core/src/java/org/apache/orc/impl/StreamName.java +++ b/java/core/src/java/org/apache/orc/impl/StreamName.java @@ -78,6 +78,7 @@ public class StreamName implements Comparable<StreamName> { case ROW_INDEX: case DICTIONARY_COUNT: case BLOOM_FILTER: + case BLOOM_FILTER_UTF8: return Area.INDEX; default: return Area.DATA; http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/impl/WriterImpl.java ---------------------------------------------------------------------- diff --git 
a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 3df1b76..940ef59 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -21,6 +21,7 @@ package org.apache.orc.impl; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.sql.Timestamp; import java.util.ArrayList; import java.util.Arrays; @@ -34,12 +35,10 @@ import io.airlift.compress.lz4.Lz4Compressor; import io.airlift.compress.lz4.Lz4Decompressor; import io.airlift.compress.lzo.LzoCompressor; import io.airlift.compress.lzo.LzoDecompressor; -import io.airlift.compress.snappy.SnappyCompressor; -import io.airlift.compress.snappy.SnappyDecompressor; -import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.hive.ql.util.JavaDataModel; import org.apache.orc.BinaryColumnStatistics; -import org.apache.orc.BloomFilterIO; +import org.apache.orc.util.BloomFilter; +import org.apache.orc.util.BloomFilterIO; import org.apache.orc.CompressionCodec; import org.apache.orc.CompressionKind; import org.apache.orc.OrcConf; @@ -50,6 +49,7 @@ import org.apache.orc.StringColumnStatistics; import org.apache.orc.StripeInformation; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; +import org.apache.orc.util.BloomFilterUtf8; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -147,6 +147,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private final OrcFile.CompressionStrategy compressionStrategy; private final boolean[] bloomFilterColumns; private final double bloomFilterFpp; + private final OrcFile.BloomFilterVersion bloomFilterVersion; private boolean writeTimeZone; public WriterImpl(FileSystem fs, @@ -157,6 +158,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback { this.conf = 
opts.getConfiguration(); this.callback = opts.getCallback(); this.schema = opts.getSchema(); + bloomFilterVersion = opts.getBloomFilterVersion(); if (callback != null) { callbackContext = new OrcFile.WriterContext(){ @@ -426,6 +428,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback { case BLOOM_FILTER: case DATA: case DICTIONARY_DATA: + case BLOOM_FILTER_UTF8: if (getCompressionStrategy() == OrcFile.CompressionStrategy.SPEED) { modifiers = EnumSet.of(CompressionCodec.Modifier.FAST, CompressionCodec.Modifier.TEXT); @@ -543,6 +546,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { public boolean hasWriterTimeZone() { return writeTimeZone; } + + public OrcFile.BloomFilterVersion getBloomFilterVersion() { + return bloomFilterVersion; + } } /** @@ -564,9 +571,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private final OrcProto.RowIndexEntry.Builder rowIndexEntry; private final PositionedOutputStream rowIndexStream; private final PositionedOutputStream bloomFilterStream; - protected final BloomFilterIO bloomFilter; + private final PositionedOutputStream bloomFilterStreamUtf8; + protected final BloomFilter bloomFilter; + protected final BloomFilterUtf8 bloomFilterUtf8; protected final boolean createBloomFilter; private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex; + private final OrcProto.BloomFilterIndex.Builder bloomFilterIndexUtf8; private final OrcProto.BloomFilter.Builder bloomFilterEntry; private boolean foundNulls; private OutStream isPresentOutStream; @@ -612,15 +622,30 @@ public class WriterImpl implements Writer, MemoryManager.Callback { } if (createBloomFilter) { bloomFilterEntry = OrcProto.BloomFilter.newBuilder(); - bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder(); - bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Kind.BLOOM_FILTER); - bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(), + if (streamFactory.getBloomFilterVersion() == 
OrcFile.BloomFilterVersion.ORIGINAL) { + bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(), + streamFactory.getBloomFilterFPP()); + bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder(); + bloomFilterStream = streamFactory.createStream(id, + OrcProto.Stream.Kind.BLOOM_FILTER);; + } else { + bloomFilter = null; + bloomFilterIndex = null; + bloomFilterStream = null; + } + bloomFilterUtf8 = new BloomFilterUtf8(streamFactory.getRowIndexStride(), streamFactory.getBloomFilterFPP()); + bloomFilterIndexUtf8 = OrcProto.BloomFilterIndex.newBuilder(); + bloomFilterStreamUtf8 = streamFactory.createStream(id, + OrcProto.Stream.Kind.BLOOM_FILTER_UTF8);; } else { bloomFilterEntry = null; bloomFilterIndex = null; + bloomFilterIndexUtf8 = null; + bloomFilterStreamUtf8 = null; bloomFilterStream = null; bloomFilter = null; + bloomFilterUtf8 = null; } } @@ -788,7 +813,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { bloomFilterIndex.build().writeTo(bloomFilterStream); bloomFilterStream.flush(); bloomFilterIndex.clear(); - bloomFilterEntry.clear(); + } + // write the bloom filter to out stream + if (bloomFilterStreamUtf8 != null) { + bloomFilterIndexUtf8.build().writeTo(bloomFilterStreamUtf8); + bloomFilterStreamUtf8.flush(); + bloomFilterIndexUtf8.clear(); } } @@ -837,12 +867,16 @@ public class WriterImpl implements Writer, MemoryManager.Callback { void addBloomFilterEntry() { if (createBloomFilter) { - bloomFilterEntry.setNumHashFunctions(bloomFilter.getNumHashFunctions()); - bloomFilterEntry.addAllBitset(Arrays.asList(ArrayUtils.toObject( - bloomFilter.getBitSet()))); - bloomFilterIndex.addBloomFilter(bloomFilterEntry.build()); - bloomFilter.reset(); - bloomFilterEntry.clear(); + if (bloomFilter != null) { + BloomFilterIO.serialize(bloomFilterEntry, bloomFilter); + bloomFilterIndex.addBloomFilter(bloomFilterEntry.build()); + bloomFilter.reset(); + } + if (bloomFilterUtf8 != null) { + BloomFilterIO.serialize(bloomFilterEntry, 
bloomFilterUtf8); + bloomFilterIndexUtf8.addBloomFilter(bloomFilterEntry.build()); + bloomFilterUtf8.reset(); + } } } @@ -946,7 +980,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { byte value = (byte) vec.vector[0]; indexStatistics.updateInteger(value, length); if (createBloomFilter) { - bloomFilter.addLong(value); + if (bloomFilter != null) { + bloomFilter.addLong(value); + } + bloomFilterUtf8.addLong(value); } for(int i=0; i < length; ++i) { writer.write(value); @@ -959,7 +996,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { writer.write(value); indexStatistics.updateInteger(value, 1); if (createBloomFilter) { - bloomFilter.addLong(value); + if (bloomFilter != null) { + bloomFilter.addLong(value); + } + bloomFilterUtf8.addLong(value); } } } @@ -1017,7 +1057,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { long value = vec.vector[0]; indexStatistics.updateInteger(value, length); if (createBloomFilter) { - bloomFilter.addLong(value); + if (bloomFilter != null) { + bloomFilter.addLong(value); + } + bloomFilterUtf8.addLong(value); } for(int i=0; i < length; ++i) { writer.write(value); @@ -1030,7 +1073,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { writer.write(value); indexStatistics.updateInteger(value, 1); if (createBloomFilter) { - bloomFilter.addLong(value); + if (bloomFilter != null) { + bloomFilter.addLong(value); + } + bloomFilterUtf8.addLong(value); } } } @@ -1077,7 +1123,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { float value = (float) vec.vector[0]; indexStatistics.updateDouble(value); if (createBloomFilter) { - bloomFilter.addDouble(value); + if (bloomFilter != null) { + bloomFilter.addDouble(value); + } + bloomFilterUtf8.addDouble(value); } for(int i=0; i < length; ++i) { utils.writeFloat(stream, value); @@ -1090,7 +1139,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { utils.writeFloat(stream, 
value); indexStatistics.updateDouble(value); if (createBloomFilter) { - bloomFilter.addDouble(value); + if (bloomFilter != null) { + bloomFilter.addDouble(value); + } + bloomFilterUtf8.addDouble(value); } } } @@ -1138,7 +1190,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { double value = vec.vector[0]; indexStatistics.updateDouble(value); if (createBloomFilter) { - bloomFilter.addDouble(value); + if (bloomFilter != null) { + bloomFilter.addDouble(value); + } + bloomFilterUtf8.addDouble(value); } for(int i=0; i < length; ++i) { utils.writeDouble(stream, value); @@ -1151,7 +1206,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { utils.writeDouble(stream, value); indexStatistics.updateDouble(value); if (createBloomFilter) { - bloomFilter.addDouble(value); + if (bloomFilter != null) { + bloomFilter.addDouble(value); + } + bloomFilterUtf8.addDouble(value); } } } @@ -1430,7 +1488,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { indexStatistics.updateString(vec.vector[0], vec.start[0], vec.length[0], length); if (createBloomFilter) { - bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + if (bloomFilter != null) { + // translate from UTF-8 to the default charset + bloomFilter.addString(new String(vec.vector[0], vec.start[0], + vec.length[0], StandardCharsets.UTF_8)); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); } } } else { @@ -1447,7 +1510,13 @@ public class WriterImpl implements Writer, MemoryManager.Callback { indexStatistics.updateString(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i], 1); if (createBloomFilter) { - bloomFilter.addBytes(vec.vector[offset + i], + if (bloomFilter != null) { + // translate from UTF-8 to the default charset + bloomFilter.addString(new String(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i], + StandardCharsets.UTF_8)); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], 
vec.start[offset + i], vec.length[offset + i]); } } @@ -1504,7 +1573,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { } indexStatistics.updateString(ptr, ptrOffset, itemLength, length); if (createBloomFilter) { - bloomFilter.addBytes(ptr, ptrOffset, itemLength); + if (bloomFilter != null) { + // translate from UTF-8 to the default charset + bloomFilter.addString(new String(vec.vector[0], vec.start[0], + vec.length[0], StandardCharsets.UTF_8)); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); } } } else { @@ -1531,7 +1605,14 @@ public class WriterImpl implements Writer, MemoryManager.Callback { } indexStatistics.updateString(ptr, ptrOffset, itemLength, 1); if (createBloomFilter) { - bloomFilter.addBytes(ptr, ptrOffset, itemLength); + if (bloomFilter != null) { + // translate from UTF-8 to the default charset + bloomFilter.addString(new String(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i], + StandardCharsets.UTF_8)); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); } } } @@ -1576,7 +1657,14 @@ public class WriterImpl implements Writer, MemoryManager.Callback { indexStatistics.updateString(vec.vector[0], vec.start[0], itemLength, length); if (createBloomFilter) { - bloomFilter.addBytes(vec.vector[0], vec.start[0], itemLength); + if (bloomFilter != null) { + // translate from UTF-8 to the default charset + bloomFilter.addString(new String(vec.vector[0], + vec.start[0], itemLength, + StandardCharsets.UTF_8)); + } + bloomFilterUtf8.addBytes(vec.vector[0], + vec.start[0], itemLength); } } } else { @@ -1594,7 +1682,13 @@ public class WriterImpl implements Writer, MemoryManager.Callback { indexStatistics.updateString(vec.vector[offset + i], vec.start[offset + i], itemLength, 1); if (createBloomFilter) { - bloomFilter.addBytes(vec.vector[offset + i], + if (bloomFilter != null) { + // translate from UTF-8 to the default charset + 
bloomFilter.addString(new String(vec.vector[offset + i], + vec.start[offset + i], itemLength, + StandardCharsets.UTF_8)); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], vec.start[offset + i], itemLength); } } @@ -1646,7 +1740,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { indexStatistics.updateBinary(vec.vector[0], vec.start[0], vec.length[0], length); if (createBloomFilter) { - bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); } } } else { @@ -1658,7 +1755,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback { indexStatistics.updateBinary(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i], 1); if (createBloomFilter) { - bloomFilter.addBytes(vec.vector[offset + i], + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]); } } @@ -1734,7 +1835,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { long millis = val.getTime(); indexStatistics.updateTimestamp(millis); if (createBloomFilter) { - bloomFilter.addLong(millis); + if (bloomFilter != null) { + bloomFilter.addLong(millis); + } + bloomFilterUtf8.addLong(millis); } final long secs = millis / MILLIS_PER_SECOND - base_timestamp; final long nano = formatNanos(val.getNanos()); @@ -1753,7 +1857,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { nanos.write(formatNanos(val.getNanos())); indexStatistics.updateTimestamp(millis); if (createBloomFilter) { - bloomFilter.addLong(millis); + if (bloomFilter != null) { + bloomFilter.addLong(millis); + } + bloomFilterUtf8.addLong(millis); } } } @@ -1819,7 +1926,10 @@ public class WriterImpl implements Writer, 
MemoryManager.Callback { int value = (int) vec.vector[0]; indexStatistics.updateDate(value); if (createBloomFilter) { - bloomFilter.addLong(value); + if (bloomFilter != null) { + bloomFilter.addLong(value); + } + bloomFilterUtf8.addLong(value); } for(int i=0; i < length; ++i) { writer.write(value); @@ -1832,7 +1942,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { writer.write(value); indexStatistics.updateDate(value); if (createBloomFilter) { - bloomFilter.addLong(value); + if (bloomFilter != null) { + bloomFilter.addLong(value); + } + bloomFilterUtf8.addLong(value); } } } @@ -1901,7 +2014,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback { HiveDecimal value = vec.vector[0].getHiveDecimal(); indexStatistics.updateDecimal(value); if (createBloomFilter) { - bloomFilter.addString(value.toString()); + String str = value.toString(); + if (bloomFilter != null) { + bloomFilter.addString(str); + } + bloomFilterUtf8.addString(str); } for(int i=0; i < length; ++i) { SerializationUtils.writeBigInteger(valueStream, @@ -1918,7 +2035,11 @@ public class WriterImpl implements Writer, MemoryManager.Callback { scaleStream.write(value.scale()); indexStatistics.updateDecimal(value); if (createBloomFilter) { - bloomFilter.addString(value.toString()); + String str = value.toString(); + if (bloomFilter != null) { + bloomFilter.addString(str); + } + bloomFilterUtf8.addString(str); } } } @@ -2065,7 +2186,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { childrenWriters[0].writeBatch(vec.child, childOffset, childLength); } if (createBloomFilter) { - bloomFilter.addLong(childLength); + if (bloomFilter != null) { + bloomFilter.addLong(childLength); + } + bloomFilterUtf8.addLong(childLength); } } } else { @@ -2088,6 +2212,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { } else { currentLength += nextLength; } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addLong(nextLength); 
+ } + bloomFilterUtf8.addLong(nextLength); + } } } if (currentLength != 0) { @@ -2161,7 +2291,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { childrenWriters[1].writeBatch(vec.values, childOffset, childLength); } if (createBloomFilter) { - bloomFilter.addLong(childLength); + if (bloomFilter != null) { + bloomFilter.addLong(childLength); + } + bloomFilterUtf8.addLong(childLength); } } } else { @@ -2186,6 +2319,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { } else { currentLength += nextLength; } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addLong(nextLength); + } + bloomFilterUtf8.addLong(nextLength); + } } } if (currentLength != 0) { @@ -2247,7 +2386,10 @@ public class WriterImpl implements Writer, MemoryManager.Callback { tags.write(tag); } if (createBloomFilter) { - bloomFilter.addLong(tag); + if (bloomFilter != null) { + bloomFilter.addLong(tag); + } + bloomFilterUtf8.addLong(tag); } childrenWriters[tag].writeBatch(vec.fields[tag], offset, length); } @@ -2275,6 +2417,12 @@ public class WriterImpl implements Writer, MemoryManager.Callback { currentStart[tag] = i + offset; currentLength[tag] = 1; } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addLong(tag); + } + bloomFilterUtf8.addLong(tag); + } } } // write out any left over sequences http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/util/BloomFilter.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/util/BloomFilter.java b/java/core/src/java/org/apache/orc/util/BloomFilter.java new file mode 100644 index 0000000..0d9db24 --- /dev/null +++ b/java/core/src/java/org/apache/orc/util/BloomFilter.java @@ -0,0 +1,328 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.util; + +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; + +/** + * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are + * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of + * bloom filters, false positives (element not present in bloom filter but test() says true) are + * possible but false negatives are not possible (if element is present then test() will never + * say false). The false positive probability is configurable (default: 5%), and the storage + * requirement increases or decreases accordingly: the lower the false positive probability, + * the greater the space requirement. + * Bloom filters are sensitive to the number of elements that will be inserted in the bloom filter. + * During the creation of a bloom filter the expected number of entries must be specified. If the number + * of insertions exceeds the specified initial number of entries then the false positive probability will + * increase accordingly. + * + * Internally, this implementation of bloom filter uses the Murmur3 fast non-cryptographic hash + * algorithm.
Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash + collisions for specific sequences of repeating bytes. Check the following link for more info + https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw + * + * Note that this class is here for backwards compatibility, because it uses + * the JVM default character set for strings. All new users should use + * BloomFilterUtf8, which always uses UTF8 for the encoding. + */ +public class BloomFilter { + public static final double DEFAULT_FPP = 0.05; + private final BitSet bitSet; + private final int numBits; + private final int numHashFunctions; + + static void checkArgument(boolean expression, String message) { + if (!expression) { + throw new IllegalArgumentException(message); + } + } + + public BloomFilter(long expectedEntries) { + this(expectedEntries, DEFAULT_FPP); + } + + public BloomFilter(long expectedEntries, double fpp) { + checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0"); + int nb = optimalNumOfBits(expectedEntries, fpp); + // make 'm' multiple of 64 + this.numBits = nb + (Long.SIZE - (nb % Long.SIZE)); + this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits); + this.bitSet = new BitSet(numBits); + } + + /** + * A constructor to support rebuilding the BloomFilter from a serialized representation.
+ * @param bits the serialized bits + * @param numFuncs the number of functions used + */ + public BloomFilter(long[] bits, int numFuncs) { + super(); + bitSet = new BitSet(bits); + this.numBits = (int) bitSet.bitSize(); + numHashFunctions = numFuncs; + } + + static int optimalNumOfHashFunctions(long n, long m) { + return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); + } + + static int optimalNumOfBits(long n, double p) { + return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); + } + + @Override + public boolean equals(Object other) { + return other != null && + other.getClass() == getClass() && + numBits == ((BloomFilter) other).numBits && + numHashFunctions == ((BloomFilter) other).numHashFunctions && + bitSet.equals(((BloomFilter) other).bitSet); + } + + public void add(byte[] val) { + if (val == null) { + addBytes(val, -1, -1); + } else { + addBytes(val, 0, val.length); + } + } + + public void addBytes(byte[] val, int offset, int length) { + // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" + // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively + // implement a Bloom filter without any loss in the asymptotic false positive probability' + + // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned + // in the above paper + long hash64 = val == null ? 
Murmur3.NULL_HASHCODE : + Murmur3.hash64(val, offset, length); + addHash(hash64); + } + + private void addHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + bitSet.set(pos); + } + } + + public void addString(String val) { + if (val == null) { + add(null); + } else { + add(val.getBytes(Charset.defaultCharset())); + } + } + + public void addLong(long val) { + addHash(getLongHash(val)); + } + + public void addDouble(double val) { + addLong(Double.doubleToLongBits(val)); + } + + public boolean test(byte[] val) { + if (val == null) { + return testBytes(val, -1, -1); + } + return testBytes(val, 0, val.length); + } + + public boolean testBytes(byte[] val, int offset, int length) { + long hash64 = val == null ? 
Murmur3.NULL_HASHCODE : + Murmur3.hash64(val, offset, length); + return testHash(hash64); + } + + private boolean testHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + if (!bitSet.get(pos)) { + return false; + } + } + return true; + } + + public boolean testString(String val) { + if (val == null) { + return test(null); + } else { + return test(val.getBytes(Charset.defaultCharset())); + } + } + + public boolean testLong(long val) { + return testHash(getLongHash(val)); + } + + // Thomas Wang's integer hash function + // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm + private long getLongHash(long key) { + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; + } + + public boolean testDouble(double val) { + return testLong(Double.doubleToLongBits(val)); + } + + public long sizeInBytes() { + return getBitSize() / 8; + } + + public int getBitSize() { + return bitSet.getData().length * Long.SIZE; + } + + public int getNumHashFunctions() { + return numHashFunctions; + } + + public long[] getBitSet() { + return bitSet.getData(); + } + + @Override + public String toString() { + return "m: " + numBits + " k: " + numHashFunctions; + } + + /** + * Merge the specified bloom filter with current bloom filter. 
+ * + * @param that - bloom filter to merge + */ + public void merge(BloomFilter that) { + if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) { + this.bitSet.putAll(that.bitSet); + } else { + throw new IllegalArgumentException("BloomFilters are not compatible for merging." + + " this - " + this.toString() + " that - " + that.toString()); + } + } + + public void reset() { + this.bitSet.clear(); + } + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */ + public static class BitSet { + private final long[] data; + + public BitSet(long bits) { + this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]); + } + + /** + * Deserialize long array as bit set. + * + * @param data - bit array + */ + public BitSet(long[] data) { + assert data.length > 0 : "data length is zero!"; + this.data = data; + } + + /** + * Sets the bit at specified index. + * + * @param index - position + */ + public void set(int index) { + data[index >>> 6] |= (1L << index); + } + + /** + * Returns true if the bit is set in the specified index. + * + * @param index - position + * @return - value at the bit position + */ + public boolean get(int index) { + return (data[index >>> 6] & (1L << index)) != 0; + } + + /** + * Number of bits + */ + public long bitSize() { + return (long) data.length * Long.SIZE; + } + + public long[] getData() { + return data; + } + + /** + * Combines the two BitArrays using bitwise OR. + */ + public void putAll(BitSet array) { + assert data.length == array.data.length : + "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")"; + for (int i = 0; i < data.length; i++) { + data[i] |= array.data[i]; + } + } + + /** + * Clear the bit set. 
+ */ + public void clear() { + Arrays.fill(data, 0); + } + + @Override + public boolean equals(Object other) { + return other != null && + other.getClass() == getClass() && + Arrays.equals(data, ((BitSet) other).data); + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/util/BloomFilterIO.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterIO.java b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java new file mode 100644 index 0000000..a6c3940 --- /dev/null +++ b/java/core/src/java/org/apache/orc/util/BloomFilterIO.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.util; + +import com.google.protobuf.ByteString; +import org.apache.orc.OrcFile; +import org.apache.orc.OrcProto; +import org.apache.orc.TypeDescription; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +public class BloomFilterIO { + + private BloomFilterIO() { + // never called + } + + /** + * Deserialize a bloom filter from the ORC file. 
+ */ + public static BloomFilter deserialize(OrcProto.Stream.Kind kind, + OrcFile.WriterVersion fileVersion, + TypeDescription.Category type, + OrcProto.BloomFilter bloomFilter) { + if (bloomFilter == null) { + return null; + } + int numFuncs = bloomFilter.getNumHashFunctions(); + switch (kind) { + case BLOOM_FILTER: { + long values[] = new long[bloomFilter.getBitsetCount()]; + for (int i = 0; i < values.length; ++i) { + values[i] = bloomFilter.getBitset(i); + } + // After HIVE-12055 the bloom filters for strings correctly use + // UTF8. + if (fileVersion.includes(OrcFile.WriterVersion.HIVE_12055) && + (type == TypeDescription.Category.STRING || + type == TypeDescription.Category.CHAR || + type == TypeDescription.Category.VARCHAR)) { + return new BloomFilterUtf8(values, numFuncs); + } + return new BloomFilter(values, numFuncs); + } + case BLOOM_FILTER_UTF8: { + ByteString bits = bloomFilter.getUtf8Bitset(); + long[] values = new long[bits.size() / 8]; + bits.asReadOnlyByteBuffer().order(ByteOrder.LITTLE_ENDIAN) + .asLongBuffer().get(values); + return new BloomFilterUtf8(values, numFuncs); + } + default: + throw new IllegalArgumentException("Unknown bloom filter kind " + kind); + } + } + + /** + * Serialize the BloomFilter to the ORC file. 
+ * @param builder the builder to write to + * @param bloomFilter the bloom filter to serialize + */ + public static void serialize(OrcProto.BloomFilter.Builder builder, + BloomFilter bloomFilter) { + builder.clear(); + builder.setNumHashFunctions(bloomFilter.getNumHashFunctions()); + long[] bitset = bloomFilter.getBitSet(); + if (bloomFilter instanceof BloomFilterUtf8) { + ByteBuffer buffer = ByteBuffer.allocate(bitset.length * 8); + buffer.order(ByteOrder.LITTLE_ENDIAN); + buffer.asLongBuffer().put(bitset); + builder.setUtf8Bitset(ByteString.copyFrom(buffer)); + } else { + for(int i=0; i < bitset.length; ++i) { + builder.addBitset(bitset[i]); + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java b/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java new file mode 100644 index 0000000..aad4fab --- /dev/null +++ b/java/core/src/java/org/apache/orc/util/BloomFilterUtf8.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.util; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; + +/** + * This class represents the fix from ORC-101 where we fixed the bloom filter + * from using the JVM's default character set to always using UTF-8. + */ +public class BloomFilterUtf8 extends BloomFilter { + + public BloomFilterUtf8(long expectedEntries, double fpp) { + super(expectedEntries, fpp); + } + + public BloomFilterUtf8(long[] bits, int numFuncs) { + super(bits, numFuncs); + } + + + public void addString(String val) { + if (val == null) { + add(null); + } else { + add(val.getBytes(StandardCharsets.UTF_8)); + } + } + + public boolean testString(String val) { + if (val == null) { + return test(null); + } else { + return test(val.getBytes(StandardCharsets.UTF_8)); + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/833cc0e7/java/core/src/test/org/apache/orc/TestVectorOrcFile.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index af20d1f..5ef0ced 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -1904,8 +1904,8 @@ public class TestVectorOrcFile { .withZeroCopy(false) .build()); OrcIndex index = - meta.readRowIndex(reader.getStripes().get(0), null, null, null, null, - null); + meta.readRowIndex(reader.getStripes().get(0), null, null, false, null, null, + null, OrcFile.WriterVersion.ORC_101, null, null); // check the primitive columns to make sure they have the right number of // items in the first row group for(int c=1; c < 9; ++c) {