This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 4ebfd94ff ORC-1604: Deprecate non-utf8 bloom filter for Java writer
4ebfd94ff is described below
commit 4ebfd94ff75e9f0b77230ae611f33008e0ac88dd
Author: sychen <[email protected]>
AuthorDate: Sat Feb 3 01:12:21 2024 -0800
ORC-1604: Deprecate non-utf8 bloom filter for Java writer
### What changes were proposed in this pull request?
This PR aims to deprecate the non-UTF8 bloom filter for the Java writer.
1. deprecate `org.apache.orc.OrcFile.WriterOptions#bloomFilterVersion`
2. deprecate `org.apache.orc.OrcFile.WriterOptions#getBloomFilterVersion`
3. deprecate
`org.apache.orc.impl.writer.WriterContext#getBloomFilterVersion`
### Why are the changes needed?
1. `orc.bloom.filter.write.version=original` writes two copies of the bloom
filter data instead of one, which increases the size of ORC files and also
causes Spark 2.x to fail to read `BloomFilterUtf8`
[comment-17800800](https://issues.apache.org/jira/browse/ORC-297?focusedCommentId=17800800&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17800800)
2. The C++ writer does not implement the `original` version
3. There is a plan to remove the non-UTF8 bloom filter in `orc-format` `ORCv2.md`
### How was this patch tested?
GA
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1776 from cxzl25/ORC-1604.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 6c3c451c59b99b448d9bb5dd18c9fb70a9812123)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
java/core/src/java/org/apache/orc/OrcConf.java | 2 +-
java/core/src/java/org/apache/orc/OrcFile.java | 2 ++
java/core/src/java/org/apache/orc/impl/WriterImpl.java | 1 +
java/core/src/java/org/apache/orc/impl/writer/WriterContext.java | 1 +
java/core/src/test/org/apache/orc/TestStringDictionary.java | 1 +
site/_docs/core-java-config.md | 2 +-
6 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java
b/java/core/src/java/org/apache/orc/OrcConf.java
index fa7bc9bf3..17f5d4f3c 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -131,7 +131,7 @@ public enum OrcConf {
"", "List of columns to create bloom filters for when writing."),
BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
"orc.bloom.filter.write.version",
OrcFile.BloomFilterVersion.UTF8.toString(),
- "Which version of the bloom filters should we write.\n" +
+ "(Deprecated) Which version of the bloom filters should we write.\n" +
"The choices are:\n" +
" original - writes two versions of the bloom filters for use by\n"
+
" both old and new readers.\n" +
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java
b/java/core/src/java/org/apache/orc/OrcFile.java
index 86444868b..278c0813e 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -723,6 +723,7 @@ public class OrcFile {
/**
* Set the version of the bloom filters to write.
*/
+ @Deprecated
public WriterOptions bloomFilterVersion(BloomFilterVersion version) {
this.bloomFilterVersion = version;
return this;
@@ -978,6 +979,7 @@ public class OrcFile {
return bloomFilterFpp;
}
+ @Deprecated
public BloomFilterVersion getBloomFilterVersion() {
return bloomFilterVersion;
}
diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index c028228ef..bd1e6afad 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -450,6 +450,7 @@ public class WriterImpl implements WriterInternal,
MemoryManager.Callback {
}
@Override
+ @Deprecated
public OrcFile.BloomFilterVersion getBloomFilterVersion() {
return bloomFilterVersion;
}
diff --git a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
index e9534c6f0..03c31b660 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java
@@ -83,6 +83,7 @@ public interface WriterContext {
*/
OrcFile.Version getVersion();
+ @Deprecated
OrcFile.BloomFilterVersion getBloomFilterVersion();
void writeIndex(StreamName name,
diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java
b/java/core/src/test/org/apache/orc/TestStringDictionary.java
index 62ccac76a..a7a1d714c 100644
--- a/java/core/src/test/org/apache/orc/TestStringDictionary.java
+++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java
@@ -247,6 +247,7 @@ public class TestStringDictionary {
}
@Override
+ @Deprecated
public OrcFile.BloomFilterVersion getBloomFilterVersion() {
return OrcFile.BloomFilterVersion.UTF8;
}
diff --git a/site/_docs/core-java-config.md b/site/_docs/core-java-config.md
index 0141816c8..6db1cbd0a 100644
--- a/site/_docs/core-java-config.md
+++ b/site/_docs/core-java-config.md
@@ -190,7 +190,7 @@ permalink: /docs/core-java-config.html
<td><code>orc.bloom.filter.write.version</code></td>
<td>utf8</td>
<td>
- Which version of the bloom filters should we write. The choices are:
original - writes two versions of the bloom filters for use by
both old and new readers. utf8 - writes just the new bloom filters.
+ (Deprecated) Which version of the bloom filters should we write. The
choices are: original - writes two versions of the bloom filters for use by
both old and new readers. utf8 - writes just the new bloom filters.
</td>
</tr>
<tr>