[orc] branch branch-1.9 updated: ORC-1482: Adaptation to read ORC files created by CUDF

dongjoon Thu, 24 Aug 2023 20:10:39 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.9
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/branch-1.9 by this push:
     new 36886ae19 ORC-1482: Adaptation to read ORC files created by CUDF
36886ae19 is described below

commit 36886ae19afe1c04541e4c919442eb1f836306a7
Author: Yiqun Zhang <[email protected]>
AuthorDate: Thu Aug 24 20:10:19 2023 -0700

    ORC-1482: Adaptation to read ORC files created by CUDF
    
    ### What changes were proposed in this pull request?
    
    This PR adapts the ORC reader to read ORC files created by CUDF, which may have missing statistics in their DOUBLE/FLOAT columns.
    
    ### Why are the changes needed?
    
    Official ORC readers can't read CUDF-created ORC files properly.
    
    ### How was this patch tested?
    
    Added a unit test (testDoubleColumnWithoutDoubleStatistics).
    
    Closes #1595 from guiyanakuang/ORC-1482.
    
    Lead-authored-by: Yiqun Zhang <[email protected]>
    Co-authored-by: zhangyiqun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
    (cherry picked from commit 7787669c444d0cf18ba91effbba34b5608370b5b)
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../java/org/apache/orc/impl/RecordReaderImpl.java |  12 +++++-
 .../org/apache/orc/impl/TestRecordReaderImpl.java  |  42 +++++++++++++++++++++
 .../resources/orc-file-no-double-statistic.orc     | Bin 0 -> 161 bytes
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index def1fc169..6d65e9e5c 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -695,8 +695,8 @@ public class RecordReaderImpl implements RecordReader {
                    " include ORC-517. Writer version: {}",
           predicate.getColumnName(), writerVersion);
       return TruthValue.YES_NO_NULL;
-    } else if (category == TypeDescription.Category.DOUBLE ||
-        category == TypeDescription.Category.FLOAT) {
+    } else if ((category == TypeDescription.Category.DOUBLE ||
+        category == TypeDescription.Category.FLOAT) && cs instanceof 
DoubleColumnStatistics) {
       DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs;
       if (Double.isNaN(dstas.getSum())) {
         LOG.debug("Not using predication pushdown on {} because stats contain 
NaN values",
@@ -1708,4 +1708,12 @@ public class RecordReaderImpl implements RecordReader {
   public int getMaxDiskRangeChunkLimit() {
     return maxDiskRangeChunkLimit;
   }
+
+  /**
+   * Get sargApplier for testing.
+   * @return sargApplier in record reader.
+   */
+  SargApplier getSargApp() {
+    return sargApp;
+  }
 }
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index a4e33b826..bd0cb5619 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -41,6 +41,7 @@ import org.apache.orc.ColumnStatistics;
 import org.apache.orc.CompressionCodec;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.DataReader;
+import org.apache.orc.DoubleColumnStatistics;
 import org.apache.orc.OrcConf;
 import org.apache.orc.OrcFile;
 import org.apache.orc.OrcProto;
@@ -2474,6 +2475,47 @@ public class TestRecordReaderImpl {
     assertEquals(TruthValue.YES_NO_NULL, truthValue);
   }
 
+  @Test
+  public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
+    // orc-file-no-double-statistic.orc is an orc file created by cudf with a 
schema of
+    // struct<x:double>, one row and a value of null.
+    // Test file source 
https://issues.apache.org/jira/projects/ORC/issues/ORC-1482
+    Path filePath = new 
Path(ClassLoader.getSystemResource("orc-file-no-double-statistic.orc")
+        .getPath());
+
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+
+    Reader reader = OrcFile.createReader(filePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+
+    TypeDescription schema = TypeDescription.fromString("struct<x:double>");
+
+    assertEquals(schema, reader.getSchema());
+    assertFalse(reader.getStatistics()[0] instanceof DoubleColumnStatistics);
+
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .isNull("x", PredicateLeaf.Type.FLOAT)
+        .build();
+
+    Reader.Options options = reader.options()
+        .searchArgument(sarg, new String[] {"x"})
+        .useSelected(true)
+        .allowSARGToFilter(true);
+
+    VectorizedRowBatch batch = schema.createRowBatch();
+    long rowCount = 0;
+    try (RecordReader rr = reader.rows(options)) {
+      assertTrue(rr.nextBatch(batch));
+      rowCount += batch.size;
+      assertFalse(rr.nextBatch(batch));
+      if (rr instanceof RecordReaderImpl) {
+        assertEquals(0, ((RecordReaderImpl) 
rr).getSargApp().getExceptionCount()[0]);
+      }
+    }
+    assertEquals(1, rowCount);
+  }
+
   @Test
   public void testMissMinOrMaxInStatistics() {
     OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
diff --git a/java/core/src/test/resources/orc-file-no-double-statistic.orc b/java/core/src/test/resources/orc-file-no-double-statistic.orc
new file mode 100644
index 000000000..9da6e42e3
Binary files /dev/null and b/java/core/src/test/resources/orc-file-no-double-statistic.orc differ

[orc] branch branch-1.9 updated: ORC-1482: Adaptation to read ORC files created by CUDF

Reply via email to