This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 4ebfd94ff ORC-1604: Deprecate non-utf8 bloom filter for Java writer
4ebfd94ff is described below

commit 4ebfd94ff75e9f0b77230ae611f33008e0ac88dd
Author: sychen <[email protected]>
AuthorDate: Sat Feb 3 01:12:21 2024 -0800

    ORC-1604: Deprecate non-utf8 bloom filter for Java writer
    
    ### What changes were proposed in this pull request?
    This PR aims to deprecate the non-utf8 bloom filter for the Java writer.
    1. deprecate `org.apache.orc.OrcFile.WriterOptions#bloomFilterVersion`
    2. deprecate `org.apache.orc.OrcFile.WriterOptions#getBloomFilterVersion`
    3. deprecate `org.apache.orc.impl.writer.WriterContext#getBloomFilterVersion`
    
    ### Why are the changes needed?
    1. `orc.bloom.filter.write.version=original` will write two copies of the data
    instead of one, which increases the size of ORC files and also causes Spark 2.x
    to fail to read `BloomFilterUtf8`
    
[comment-17800800](https://issues.apache.org/jira/browse/ORC-297?focusedCommentId=17800800&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17800800)
    2. C++ writer does not implement original
    3. Plan to remove non-utf8 bloom filter in `orc-format` `ORCv2.md`
    
    ### How was this patch tested?
    GA
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #1776 from cxzl25/ORC-1604.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
    (cherry picked from commit 6c3c451c59b99b448d9bb5dd18c9fb70a9812123)
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 java/core/src/java/org/apache/orc/OrcConf.java                   | 2 +-
 java/core/src/java/org/apache/orc/OrcFile.java                   | 2 ++
 java/core/src/java/org/apache/orc/impl/WriterImpl.java           | 1 +
 java/core/src/java/org/apache/orc/impl/writer/WriterContext.java | 1 +
 java/core/src/test/org/apache/orc/TestStringDictionary.java      | 1 +
 site/_docs/core-java-config.md                                   | 2 +-
 6 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/OrcConf.java 
b/java/core/src/java/org/apache/orc/OrcConf.java
index fa7bc9bf3..17f5d4f3c 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -131,7 +131,7 @@ public enum OrcConf {
       "", "List of columns to create bloom filters for when writing."),
   BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
       "orc.bloom.filter.write.version", 
OrcFile.BloomFilterVersion.UTF8.toString(),
-      "Which version of the bloom filters should we write.\n" +
+      "(Deprecated) Which version of the bloom filters should we write.\n" +
           "The choices are:\n" +
           "  original - writes two versions of the bloom filters for use by\n" 
+
           "             both old and new readers.\n" +
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java 
b/java/core/src/java/org/apache/orc/OrcFile.java
index 86444868b..278c0813e 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -723,6 +723,7 @@ public class OrcFile {
     /**
      * Set the version of the bloom filters to write.
      */
+    @Deprecated
     public WriterOptions bloomFilterVersion(BloomFilterVersion version) {
       this.bloomFilterVersion = version;
       return this;
@@ -978,6 +979,7 @@ public class OrcFile {
       return bloomFilterFpp;
     }
 
+    @Deprecated
     public BloomFilterVersion getBloomFilterVersion() {
       return bloomFilterVersion;
     }
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java 
b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index c028228ef..bd1e6afad 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -450,6 +450,7 @@ public class WriterImpl implements WriterInternal, 
MemoryManager.Callback {
     }
 
     @Override
+    @Deprecated
     public OrcFile.BloomFilterVersion getBloomFilterVersion() {
       return bloomFilterVersion;
     }
diff --git a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java 
b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
index e9534c6f0..03c31b660 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
@@ -83,6 +83,7 @@ public interface WriterContext {
    */
   OrcFile.Version getVersion();
 
+  @Deprecated
   OrcFile.BloomFilterVersion getBloomFilterVersion();
 
   void writeIndex(StreamName name,
diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java 
b/java/core/src/test/org/apache/orc/TestStringDictionary.java
index 62ccac76a..a7a1d714c 100644
--- a/java/core/src/test/org/apache/orc/TestStringDictionary.java
+++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java
@@ -247,6 +247,7 @@ public class TestStringDictionary {
     }
 
     @Override
+    @Deprecated
     public OrcFile.BloomFilterVersion getBloomFilterVersion() {
       return OrcFile.BloomFilterVersion.UTF8;
     }
diff --git a/site/_docs/core-java-config.md b/site/_docs/core-java-config.md
index 0141816c8..6db1cbd0a 100644
--- a/site/_docs/core-java-config.md
+++ b/site/_docs/core-java-config.md
@@ -190,7 +190,7 @@ permalink: /docs/core-java-config.html
   <td><code>orc.bloom.filter.write.version</code></td>
   <td>utf8</td>
   <td>
-    Which version of the bloom filters should we write. The choices are:   
original - writes two versions of the bloom filters for use by              
both old and new readers.   utf8 - writes just the new bloom filters.
+    (Deprecated) Which version of the bloom filters should we write. The 
choices are:   original - writes two versions of the bloom filters for use by   
           both old and new readers.   utf8 - writes just the new bloom filters.
   </td>
 </tr>
 <tr>

Reply via email to