This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.9
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.9 by this push:
new 36886ae19 ORC-1482: Adaptation to read ORC files created by CUDF
36886ae19 is described below
commit 36886ae19afe1c04541e4c919442eb1f836306a7
Author: Yiqun Zhang <[email protected]>
AuthorDate: Thu Aug 24 20:10:19 2023 -0700
ORC-1482: Adaptation to read ORC files created by CUDF
### What changes were proposed in this pull request?
This PR adapts the reader to handle ORC files created by CUDF, which may
be missing statistics for their DOUBLE/FLOAT columns.
### Why are the changes needed?
Official ORC readers can't read CUDF-created ORC files properly.
### How was this patch tested?
Added a unit test (testDoubleColumnWithoutDoubleStatistics in TestRecordReaderImpl).
Closes #1595 from guiyanakuang/ORC-1482.
Lead-authored-by: Yiqun Zhang <[email protected]>
Co-authored-by: zhangyiqun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 7787669c444d0cf18ba91effbba34b5608370b5b)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../java/org/apache/orc/impl/RecordReaderImpl.java | 12 +++++-
.../org/apache/orc/impl/TestRecordReaderImpl.java | 42 +++++++++++++++++++++
.../resources/orc-file-no-double-statistic.orc | Bin 0 -> 161 bytes
3 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index def1fc169..6d65e9e5c 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -695,8 +695,8 @@ public class RecordReaderImpl implements RecordReader {
" include ORC-517. Writer version: {}",
predicate.getColumnName(), writerVersion);
return TruthValue.YES_NO_NULL;
- } else if (category == TypeDescription.Category.DOUBLE ||
- category == TypeDescription.Category.FLOAT) {
+ } else if ((category == TypeDescription.Category.DOUBLE ||
+ category == TypeDescription.Category.FLOAT) && cs instanceof
DoubleColumnStatistics) {
DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs;
if (Double.isNaN(dstas.getSum())) {
LOG.debug("Not using predication pushdown on {} because stats contain
NaN values",
@@ -1708,4 +1708,12 @@ public class RecordReaderImpl implements RecordReader {
public int getMaxDiskRangeChunkLimit() {
return maxDiskRangeChunkLimit;
}
+
+ /**
+ * Get sargApplier for testing.
+ * @return sargApplier in record reader.
+ */
+ SargApplier getSargApp() {
+ return sargApp;
+ }
}
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index a4e33b826..bd0cb5619 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -41,6 +41,7 @@ import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.DataReader;
+import org.apache.orc.DoubleColumnStatistics;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
@@ -2474,6 +2475,47 @@ public class TestRecordReaderImpl {
assertEquals(TruthValue.YES_NO_NULL, truthValue);
}
+ @Test
+ public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
+ // orc-file-no-double-statistic.orc is an orc file created by cudf with a
schema of
+ // struct<x:double>, one row and a value of null.
+ // Test file source
https://issues.apache.org/jira/projects/ORC/issues/ORC-1482
+ Path filePath = new
Path(ClassLoader.getSystemResource("orc-file-no-double-statistic.orc")
+ .getPath());
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+
+ Reader reader = OrcFile.createReader(filePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ TypeDescription schema = TypeDescription.fromString("struct<x:double>");
+
+ assertEquals(schema, reader.getSchema());
+ assertFalse(reader.getStatistics()[0] instanceof DoubleColumnStatistics);
+
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .isNull("x", PredicateLeaf.Type.FLOAT)
+ .build();
+
+ Reader.Options options = reader.options()
+ .searchArgument(sarg, new String[] {"x"})
+ .useSelected(true)
+ .allowSARGToFilter(true);
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ long rowCount = 0;
+ try (RecordReader rr = reader.rows(options)) {
+ assertTrue(rr.nextBatch(batch));
+ rowCount += batch.size;
+ assertFalse(rr.nextBatch(batch));
+ if (rr instanceof RecordReaderImpl) {
+ assertEquals(0, ((RecordReaderImpl)
rr).getSargApp().getExceptionCount()[0]);
+ }
+ }
+ assertEquals(1, rowCount);
+ }
+
@Test
public void testMissMinOrMaxInStatistics() {
OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
diff --git a/java/core/src/test/resources/orc-file-no-double-statistic.orc
b/java/core/src/test/resources/orc-file-no-double-statistic.orc
new file mode 100644
index 000000000..9da6e42e3
Binary files /dev/null and
b/java/core/src/test/resources/orc-file-no-double-statistic.orc differ