This is an automated email from the ASF dual-hosted git repository. omalley pushed a commit to branch branch-1.6 in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.6 by this push: new de1bd43 ORC-623: Fix evaluation of SArgs over all null columns for ints and doubles. de1bd43 is described below commit de1bd43ba779178e02c0c84f6353fee9e494025a Author: Owen O'Malley <omal...@apache.org> AuthorDate: Tue Sep 1 09:22:34 2020 -0700 ORC-623: Fix evaluation of SArgs over all null columns for ints and doubles. Fixes #542 Signed-off-by: Owen O'Malley <omal...@apache.org> --- .../java/org/apache/orc/impl/RecordReaderImpl.java | 13 +- .../src/test/org/apache/orc/TestVectorOrcFile.java | 131 +++++++++++++++++++++ .../org/apache/orc/impl/TestRecordReaderImpl.java | 49 +++++--- 3 files changed, 176 insertions(+), 17 deletions(-) diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index 9358bf4..7142013 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -314,6 +314,15 @@ public class RecordReaderImpl implements RecordReader { this(predicate, lower, upper, hasNulls, false, false); } + /** + * A value range where the data is either missing or all null. + * @param predicate the predicate to test + * @param hasNulls whether there are nulls + */ + ValueRange(PredicateLeaf predicate, boolean hasNulls) { + this(predicate, null, null, hasNulls, false, false); + } + boolean hasValues() { return lower != null; } @@ -384,7 +393,9 @@ public class RecordReaderImpl implements RecordReader { static ValueRange getValueRange(ColumnStatistics index, PredicateLeaf predicate, boolean useUTCTimestamp) { - if (index instanceof IntegerColumnStatistics) { + if (index.getNumberOfValues() == 0) { + return new ValueRange<>(predicate, index.hasNull()); + } else if (index instanceof IntegerColumnStatistics) { IntegerColumnStatistics stats = (IntegerColumnStatistics) index; Long min = stats.getMinimum(); Long max = stats.getMaximum(); diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index f827202..77804ca 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -4062,6 +4062,137 @@ public class TestVectorOrcFile { assertEquals(0, batch.size); } + /** + * Test predicate pushdown on nulls, with different combinations of + * values and nulls. + */ + @Test + public void testPredicatePushdownAllNulls() throws Exception { + TypeDescription schema = createInnerSchema(); + try (Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).rowIndexStride(1024).version(fileFormat))) { + VectorizedRowBatch batch = schema.createRowBatch(); + batch.size = 1024; + + // write 1024 rows of (null, "val") + batch.cols[0].noNulls = false; + batch.cols[0].isNull[0] = true; + batch.cols[0].isRepeating = true; + batch.cols[1].isRepeating = true; + ((BytesColumnVector) batch.cols[1]).setVal(0, "val".getBytes(StandardCharsets.UTF_8)); + writer.addRowBatch(batch); + + // write 1024 rows of (123, null) + batch.cols[0].isNull[0] = false; + ((LongColumnVector) batch.cols[0]).vector[0] = 123; + batch.cols[1].noNulls = false; + batch.cols[1].isNull[0] = true; + writer.addRowBatch(batch); + + // write 1024 rows of (null, null) + batch.cols[0].isNull[0] = true; + writer.addRowBatch(batch); + } + + try (Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs))) { + assertEquals(3072, reader.getNumberOfRows()); + VectorizedRowBatch batch = reader.getSchema().createRowBatch(); + + // int1 is not null + SearchArgument sarg = + SearchArgumentFactory.newBuilder() + .startNot() + .isNull("int1", PredicateLeaf.Type.LONG) + .end() + .build(); + // should find one row group + try (RecordReader rows = reader.rows(reader.options().searchArgument(sarg, new String[]{}))) { + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertEquals(true, batch.cols[0].isRepeating); + assertEquals(123, ((LongColumnVector) batch.cols[0]).vector[0]); + assertEquals(false, rows.nextBatch(batch)); + } + + // string1 is not null + sarg = SearchArgumentFactory.newBuilder() + .startNot() + .isNull("string1", PredicateLeaf.Type.STRING) + .end() + .build(); + // should find one row group + try (RecordReader rows = reader.rows(reader.options().searchArgument(sarg, new String[]{}))) { + rows.nextBatch(batch); + assertEquals(1024, batch.size); + assertEquals(true, batch.cols[1].isRepeating); + assertEquals("val", ((BytesColumnVector) batch.cols[1]).toString(0)); + assertEquals(false, rows.nextBatch(batch)); + } + } + } + + /** + * Write three row groups, one with (null, null), one with (1, "val"), and one with + * alternating rows. + */ + @Test + public void testPredicatePushdownMixedNulls() throws Exception { + TypeDescription schema = createInnerSchema(); + try (Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .setSchema(schema) + .rowIndexStride(1024) + .version(fileFormat))) { + VectorizedRowBatch batch = schema.createRowBatch(); + batch.cols[0].noNulls = false; + batch.cols[1].noNulls = false; + batch.size = 1024; + for (int b = 0; b < 3; ++b) { + for (int i = 0; i < batch.size; ++i) { + if (b == 0 || (b == 2 && i % 2 == 0)) { + batch.cols[0].isNull[i] = true; // every other value is null or 1 + batch.cols[1].isNull[i] = true; // every other value is null or "val" + } else { + batch.cols[0].isNull[i] = false; + ((LongColumnVector) batch.cols[0]).vector[i] = 1; + batch.cols[1].isNull[i] = false; + ((BytesColumnVector) batch.cols[1]).setVal(i, "val".getBytes(StandardCharsets.UTF_8)); + } + } + writer.addRowBatch(batch); + } + } + + try (Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs))) { + assertEquals(3*1024, reader.getNumberOfRows()); + VectorizedRowBatch batch = reader.getSchema().createRowBatch(); + + // int1 not in (1) -- should select 0 of the row groups + SearchArgument sarg = + SearchArgumentFactory.newBuilder() + .startNot() + .in("int1", PredicateLeaf.Type.LONG, 1L) + .end().build(); + + try (RecordReader rows = reader.rows(reader.options().searchArgument(sarg, new String[]{}))) { + assertEquals(false, rows.nextBatch(batch)); + } + + + // string1 not in ("val") -- should select 0 of the row groups + sarg = SearchArgumentFactory.newBuilder() + .startNot() + .in("string1", PredicateLeaf.Type.STRING, "val") + .end().build(); + + try (RecordReader rows = reader.rows(reader.options().searchArgument(sarg, new String[]{}))) { + assertEquals(false, rows.nextBatch(batch)); + } + } + } + @Test public void testColumnEncryption() throws Exception { Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11); diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index 6ca24a6..9f3ef90 100644 --- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -378,19 +378,22 @@ public class TestRecordReaderImpl { assertEquals(10.0d, RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, OrcProto.ColumnStatistics.newBuilder() - .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder() - .setMinimum(10.0d).setMaximum(100.0d).build()).build()), + .setNumberOfValues(1) + .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder() + .setMinimum(10.0d).setMaximum(100.0d).build()).build()), new StubPredicate(PredicateLeaf.Type.FLOAT), true).lower); assertEquals(null, RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, OrcProto.ColumnStatistics.newBuilder() + .setNumberOfValues(1) .setStringStatistics(OrcProto.StringStatistics.newBuilder().build()) .build()), new StubPredicate(PredicateLeaf.Type.STRING),true).lower); assertEquals("a", RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, OrcProto.ColumnStatistics.newBuilder() - .setStringStatistics(OrcProto.StringStatistics.newBuilder() - .setMinimum("a").setMaximum("b").build()).build()), + .setNumberOfValues(1) + .setStringStatistics(OrcProto.StringStatistics.newBuilder() + .setMinimum("a").setMaximum("b").build()).build()), new StubPredicate(PredicateLeaf.Type.STRING), true).lower); assertEquals("hello", RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, createStringStats("hello", "world")), @@ -403,16 +406,18 @@ public class TestRecordReaderImpl { private static OrcProto.ColumnStatistics createIntStats(Long min, Long max) { + OrcProto.ColumnStatistics.Builder result = + OrcProto.ColumnStatistics.newBuilder(); OrcProto.IntegerStatistics.Builder intStats = OrcProto.IntegerStatistics.newBuilder(); if (min != null) { intStats.setMinimum(min); + result.setNumberOfValues(1); } if (max != null) { intStats.setMaximum(max); } - return OrcProto.ColumnStatistics.newBuilder() - .setIntStatistics(intStats.build()).build(); + return result.setIntStatistics(intStats.build()).build(); } private static OrcProto.ColumnStatistics createBooleanStats(int n, int trueCount) { @@ -428,7 +433,8 @@ public class TestRecordReaderImpl { intStats.setMinimum(min); intStats.setMaximum(max); return OrcProto.ColumnStatistics.newBuilder() - .setIntStatistics(intStats.build()).build(); + .setNumberOfValues(1) + .setIntStatistics(intStats.build()).build(); } private static OrcProto.ColumnStatistics createDoubleStats(double min, double max) { @@ -436,7 +442,8 @@ public class TestRecordReaderImpl { dblStats.setMinimum(min); dblStats.setMaximum(max); return OrcProto.ColumnStatistics.newBuilder() - .setDoubleStatistics(dblStats.build()).build(); + .setNumberOfValues(1) + .setDoubleStatistics(dblStats.build()).build(); } //fixme @@ -445,7 +452,9 @@ public class TestRecordReaderImpl { OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder(); strStats.setMinimum(min); strStats.setMaximum(max); - return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build()) + return OrcProto.ColumnStatistics.newBuilder() + .setNumberOfValues(1) + .setStringStatistics(strStats.build()) .setHasNull(hasNull).build(); } @@ -453,7 +462,9 @@ public class TestRecordReaderImpl { OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder(); strStats.setMinimum(min); strStats.setMaximum(max); - return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build()) + return OrcProto.ColumnStatistics.newBuilder() + .setNumberOfValues(1) + .setStringStatistics(strStats.build()) .build(); } @@ -472,7 +483,8 @@ public class TestRecordReaderImpl { tsStats.setMinimumUtc(getUtcTimestamp(min)); tsStats.setMaximumUtc(getUtcTimestamp(max)); return OrcProto.ColumnStatistics.newBuilder() - .setTimestampStatistics(tsStats.build()).build(); + .setNumberOfValues(1) + .setTimestampStatistics(tsStats.build()).build(); } private static OrcProto.ColumnStatistics createDecimalStats(String min, String max) { @@ -484,7 +496,9 @@ public class TestRecordReaderImpl { OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder(); decStats.setMinimum(min); decStats.setMaximum(max); - return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build()) + return OrcProto.ColumnStatistics.newBuilder() + .setNumberOfValues(1) + .setDecimalStatistics(decStats.build()) .setHasNull(hasNull).build(); } @@ -496,18 +510,21 @@ public class TestRecordReaderImpl { assertEquals(100.0d, RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, OrcProto.ColumnStatistics.newBuilder() - .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder() + .setNumberOfValues(1) + .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder() .setMinimum(10.0d).setMaximum(100.0d).build()).build()), new StubPredicate(PredicateLeaf.Type.FLOAT), true).upper); assertEquals(null, RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, OrcProto.ColumnStatistics.newBuilder() - .setStringStatistics(OrcProto.StringStatistics.newBuilder().build()) - .build()), new StubPredicate(PredicateLeaf.Type.STRING), true).upper); + .setNumberOfValues(1) + .setStringStatistics(OrcProto.StringStatistics.newBuilder().build()) + .build()), new StubPredicate(PredicateLeaf.Type.STRING), true).upper); assertEquals("b", RecordReaderImpl.getValueRange( ColumnStatisticsImpl.deserialize(null, OrcProto.ColumnStatistics.newBuilder() - .setStringStatistics(OrcProto.StringStatistics.newBuilder() + .setNumberOfValues(1) + .setStringStatistics(OrcProto.StringStatistics.newBuilder() .setMinimum("a").setMaximum("b").build()).build()), new StubPredicate(PredicateLeaf.Type.STRING), true).upper); assertEquals("world", RecordReaderImpl.getValueRange(