This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
     new 06ee13f98  ORC-1553: Row grouping reads should be skipped when the statistics are written without any values for the SArg column
06ee13f98 is described below

commit 06ee13f98513e1aa1098a0ed08cc39620d844316
Author: Yiqun Zhang <guiyanaku...@gmail.com>
AuthorDate: Sat Dec 23 16:09:52 2023 -0800

    ORC-1553: Row grouping reads should be skipped when the statistics are written without any values for the SArg column

    ### What changes were proposed in this pull request?

    This PR aims to fix an issue where column statistics were incorrectly evaluated when no values had been written to a column, which prevented row groups from being skipped.

    ### Why are the changes needed?

    The fix improves the statistics evaluation logic, enabling row groups that do not need to be read to be skipped and thus improving performance.

    ### How was this patch tested?

    Unit tests have been added to validate the changes.

    Closes #1692 from guiyanakuang/ORC-1553.

    Authored-by: Yiqun Zhang <guiyanaku...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../java/org/apache/orc/impl/RecordReaderImpl.java | 12 ++++-
 .../org/apache/orc/impl/TestRecordReaderImpl.java  | 52 ++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 20abe5f18..791957345 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -667,6 +667,14 @@ public class RecordReaderImpl implements RecordReader {
                                            TypeDescription type,
                                            boolean writerUsedProlepticGregorian,
                                            boolean useUTCTimestamp) {
+
+    // When statsProto is EMPTY_COLUMN_STATISTICS,
+    // this column does not actually provide statistics;
+    // we cannot make any assumptions, so we return YES_NO_NULL.
+    if (statsProto == EMPTY_COLUMN_STATISTICS) {
+      return TruthValue.YES_NO_NULL;
+    }
+
     ColumnStatistics cs = ColumnStatisticsImpl.deserialize(
         null, statsProto, writerUsedProlepticGregorian, true);
     ValueRange range = getValueRange(cs, predicate, useUTCTimestamp);
@@ -747,8 +755,10 @@ public class RecordReaderImpl implements RecordReader {
                                              ValueRange range,
                                              BloomFilter bloomFilter,
                                              boolean useUTCTimestamp) {
+    // If the range is invalid, no value (including null) was written to this column,
+    // so we should return TruthValue.NO for any predicate.
     if (!range.isValid()) {
-      return TruthValue.YES_NO_NULL;
+      return TruthValue.NO;
     }
 
     // if we didn't have any values, everything must have been null
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index bd0cb5619..c0ff4f7e8 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -29,6 +29,7 @@ import org.apache.hadoop.hive.common.io.DiskRangeList;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
@@ -2475,6 +2476,57 @@ public class TestRecordReaderImpl {
     assertEquals(TruthValue.YES_NO_NULL, truthValue);
   }
 
+  @Test
+  public void testStatisticsWithNoWrites() throws Exception {
+    Path testFilePath = new Path(workDir, "rowIndexStrideNegative.orc");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    fs.delete(testFilePath, true);
+
+    TypeDescription schema = TypeDescription.fromString("struct<x:struct<z:double>,y:int>");
+    Writer writer = OrcFile.createWriter(
+        testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    StructColumnVector structColumnVector = (StructColumnVector) batch.cols[0];
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[1];
+    structColumnVector.ensureSize(1024, false);
+    structColumnVector.noNulls = false;
+    for (int i = 0; i < 1024; i++) {
+      structColumnVector.isNull[i] = true;
+      longColumnVector.vector[i] = i;
+    }
+    batch.size = 1024;
+    writer.addRowBatch(batch);
+    batch.reset();
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+
+    try (RecordReader rr = reader.rows()) {
+      RecordReaderImpl rri = (RecordReaderImpl) rr;
+      // The id of column x.z is 2; we just need to read this column.
+      OrcIndex orcIndex = rri.readRowIndex(0,
+          new boolean[] { false, false, true, false },
+          new boolean[] { false, false, true, false });
+      OrcProto.RowIndex[] rowGroupIndex = orcIndex.getRowGroupIndex();
+      OrcProto.ColumnStatistics statistics = rowGroupIndex[2].getEntry(0).getStatistics();
+      OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
+          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+          .build();
+      PredicateLeaf pred = createPredicateLeaf(
+          PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x.z", 1.0, null);
+
+      TruthValue truthValue = RecordReaderImpl.evaluatePredicateProto(
+          statistics,
+          pred, null, encoding, null,
+          CURRENT_WRITER, TypeDescription.createInt());
+
+      assertEquals(TruthValue.NO, truthValue);
+    }
+  }
+
   @Test
   public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
     // orc-file-no-double-statistic.orc is an orc file created by cudf with a schema of
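
For context, the sketch below shows how a reader-side application would observe this fix through the public ORC API: pushing down a SearchArgument on a column that never received any values should, per the PR description, let the reader skip every row group instead of decoding and filtering rows. This is a minimal sketch, not part of the commit; the file path `/tmp/example.orc` is hypothetical, and the file is assumed to have been written like the one in `testStatisticsWithNoWrites` above (schema `struct<x:struct<z:double>,y:int>` with `x` always null).

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class SkipEmptyColumnExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hypothetical file, assumed to be written like the one in the new test:
    // schema struct<x:struct<z:double>,y:int> where x (and so x.z) is always null.
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));

    // Push down an equality predicate on x.z. Since no value was ever written to
    // that column, each row group's statistics should now evaluate to TruthValue.NO,
    // allowing the reader to skip the row group entirely.
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .equals("x.z", PredicateLeaf.Type.FLOAT, 1.0)
        .end()
        .build();

    Reader.Options options = reader.options()
        .searchArgument(sarg, new String[]{"x.z"});

    TypeDescription schema = reader.getSchema();
    VectorizedRowBatch batch = schema.createRowBatch();
    long rowsRead = 0;
    try (RecordReader rows = reader.rows(options)) {
      while (rows.nextBatch(batch)) {
        rowsRead += batch.size;
      }
    }
    // Before ORC-1553 the missing statistics evaluated to YES_NO_NULL and the
    // rows were still read; with the fix, the row groups are skipped.
    System.out.println("rows read: " + rowsRead);
  }
}
```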