This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
     new 06ee13f98  ORC-1553: Row grouping reads should be skipped when the statistics are written without any values for the SArg column
06ee13f98 is described below

commit 06ee13f98513e1aa1098a0ed08cc39620d844316
Author: Yiqun Zhang <guiyanaku...@gmail.com>
AuthorDate: Sat Dec 23 16:09:52 2023 -0800

    ORC-1553: Row grouping reads should be skipped when the statistics are written without any values for the SArg column

    ### What changes were proposed in this pull request?

    This PR aims to fix an issue where column statistics were incorrectly evaluated when no values had been written to a column, which prevented row groups from being skipped.

    ### Why are the changes needed?

    The fix improves the statistics evaluation logic, enabling row groups that do not need to be read to be skipped and thus improving performance.

    ### How was this patch tested?

    Unit tests have been added to validate the changes.

    Closes #1692 from guiyanakuang/ORC-1553.

    Authored-by: Yiqun Zhang <guiyanaku...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../java/org/apache/orc/impl/RecordReaderImpl.java | 12 ++++-
 .../org/apache/orc/impl/TestRecordReaderImpl.java  | 52 ++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 20abe5f18..791957345 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -667,6 +667,14 @@ public class RecordReaderImpl implements RecordReader {
                                            TypeDescription type,
                                            boolean writerUsedProlepticGregorian,
                                            boolean useUTCTimestamp) {
+
+    // When statsProto is EMPTY_COLUMN_STATISTICS,
+    // this column does not actually provide statistics;
+    // we cannot make any assumptions, so we return YES_NO_NULL.
+    if (statsProto == EMPTY_COLUMN_STATISTICS) {
+      return TruthValue.YES_NO_NULL;
+    }
+
     ColumnStatistics cs = ColumnStatisticsImpl.deserialize(
         null, statsProto, writerUsedProlepticGregorian, true);
     ValueRange range = getValueRange(cs, predicate, useUTCTimestamp);
@@ -747,8 +755,10 @@ public class RecordReaderImpl implements RecordReader {
                                              ValueRange range,
                                              BloomFilter bloomFilter,
                                              boolean useUTCTimestamp) {
+    // If the range is invalid, no value (including null) was written to this column,
+    // so we should return TruthValue.NO for any predicate.
     if (!range.isValid()) {
-      return TruthValue.YES_NO_NULL;
+      return TruthValue.NO;
     }
 
     // if we didn't have any values, everything must have been null
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index bd0cb5619..c0ff4f7e8 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -29,6 +29,7 @@ import org.apache.hadoop.hive.common.io.DiskRangeList;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
@@ -2475,6 +2476,57 @@ public class TestRecordReaderImpl {
     assertEquals(TruthValue.YES_NO_NULL, truthValue);
   }
 
+  @Test
+  public void testStatisticsWithNoWrites() throws Exception {
+    Path testFilePath = new Path(workDir, "rowIndexStrideNegative.orc");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    fs.delete(testFilePath, true);
+
+    TypeDescription schema = TypeDescription.fromString("struct<x:struct<z:double>,y:int>");
+    Writer writer = OrcFile.createWriter(
+        testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    StructColumnVector structColumnVector = (StructColumnVector) batch.cols[0];
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[1];
+    structColumnVector.ensureSize(1024, false);
+    structColumnVector.noNulls = false;
+    for (int i = 0; i < 1024; i++) {
+      structColumnVector.isNull[i] = true;
+      longColumnVector.vector[i] = i;
+    }
+    batch.size = 1024;
+    writer.addRowBatch(batch);
+    batch.reset();
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+
+    try (RecordReader rr = reader.rows()) {
+      RecordReaderImpl rri = (RecordReaderImpl) rr;
+      // The id of column x.z is 2; we just need to read this column.
+      OrcIndex orcIndex = rri.readRowIndex(0,
+          new boolean[] { false, false, true, false },
+          new boolean[] { false, false, true, false });
+      OrcProto.RowIndex[] rowGroupIndex = orcIndex.getRowGroupIndex();
+      OrcProto.ColumnStatistics statistics = rowGroupIndex[2].getEntry(0).getStatistics();
+      OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
+          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+          .build();
+      PredicateLeaf pred = createPredicateLeaf(
+          PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x.z", 1.0, null);
+
+      TruthValue truthValue = RecordReaderImpl.evaluatePredicateProto(
+          statistics,
+          pred, null, encoding, null,
+          CURRENT_WRITER, TypeDescription.createInt());
+
+      assertEquals(TruthValue.NO, truthValue);
+    }
+  }
+
   @Test
   public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
     // orc-file-no-double-statistic.orc is an orc file created by cudf with a schema of
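
For context, the sketch below shows how a reader-side application would observe this fix through the public ORC API: pushing down a SearchArgument on a column that never received any values should, per the PR description, let the reader skip every row group instead of decoding and filtering rows. This is a minimal sketch, not part of the commit; the file path `/tmp/example.orc` is hypothetical, and the file is assumed to have been written like the one in `testStatisticsWithNoWrites` above (schema `struct<x:struct<z:double>,y:int>` with `x` always null).

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class SkipEmptyColumnExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hypothetical file, assumed to be written like the one in the new test:
    // schema struct<x:struct<z:double>,y:int> where x (and so x.z) is always null.
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));

    // Push down an equality predicate on x.z. Since no value was ever written to
    // that column, each row group's statistics should now evaluate to TruthValue.NO,
    // allowing the reader to skip the row group entirely.
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .equals("x.z", PredicateLeaf.Type.FLOAT, 1.0)
        .end()
        .build();

    Reader.Options options = reader.options()
        .searchArgument(sarg, new String[]{"x.z"});

    TypeDescription schema = reader.getSchema();
    VectorizedRowBatch batch = schema.createRowBatch();
    long rowsRead = 0;
    try (RecordReader rows = reader.rows(options)) {
      while (rows.nextBatch(batch)) {
        rowsRead += batch.size;
      }
    }
    // Before ORC-1553 the missing statistics evaluated to YES_NO_NULL and the
    // rows were still read; with the fix, the row groups are skipped.
    System.out.println("rows read: " + rowsRead);
  }
}
```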