This is an automated email from the ASF dual-hosted git repository.
omalley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new d17bd64 ORC-623: Fix evaluation of SArgs over all null columns for
ints and doubles.
d17bd64 is described below
commit d17bd6418c3f6fa797a72685fa03694768fae0cc
Author: Owen O'Malley <[email protected]>
AuthorDate: Tue Sep 1 09:22:34 2020 -0700
ORC-623: Fix evaluation of SArgs over all null columns for ints
and doubles.
Fixes #542
Signed-off-by: Owen O'Malley <[email protected]>
---
.../java/org/apache/orc/impl/RecordReaderImpl.java | 13 +-
.../src/test/org/apache/orc/TestVectorOrcFile.java | 131 +++++++++++++++++++++
.../org/apache/orc/impl/TestRecordReaderImpl.java | 49 +++++---
3 files changed, 176 insertions(+), 17 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index ec07c44..2c0998a 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -333,6 +333,15 @@ public class RecordReaderImpl implements RecordReader {
this(predicate, lower, upper, hasNulls, false, false);
}
+ /**
+ * A value range where the data is either missing or all null.
+ * @param predicate the predicate to test
+ * @param hasNulls whether there are nulls
+ */
+ ValueRange(PredicateLeaf predicate, boolean hasNulls) {
+ this(predicate, null, null, hasNulls, false, false);
+ }
+
boolean hasValues() {
return lower != null;
}
@@ -403,7 +412,9 @@ public class RecordReaderImpl implements RecordReader {
static ValueRange getValueRange(ColumnStatistics index,
PredicateLeaf predicate,
boolean useUTCTimestamp) {
- if (index instanceof IntegerColumnStatistics) {
+ if (index.getNumberOfValues() == 0) {
+ return new ValueRange<>(predicate, index.hasNull());
+ } else if (index instanceof IntegerColumnStatistics) {
IntegerColumnStatistics stats = (IntegerColumnStatistics) index;
Long min = stats.getMinimum();
Long max = stats.getMaximum();
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 1f2c8ee..11e8308 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -4062,6 +4062,137 @@ public class TestVectorOrcFile {
assertEquals(0, batch.size);
}
+ /**
+ * Test predicate pushdown on nulls, with different combinations of
+ * values and nulls.
+ */
+ @Test
+ public void testPredicatePushdownAllNulls() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ try (Writer writer = OrcFile.createWriter(testFilePath,
+
OrcFile.writerOptions(conf).setSchema(schema).rowIndexStride(1024).version(fileFormat)))
{
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1024;
+
+ // write 1024 rows of (null, "val")
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[0] = true;
+ batch.cols[0].isRepeating = true;
+ batch.cols[1].isRepeating = true;
+ ((BytesColumnVector) batch.cols[1]).setVal(0,
"val".getBytes(StandardCharsets.UTF_8));
+ writer.addRowBatch(batch);
+
+ // write 1024 rows of (1, null)
+ batch.cols[0].isNull[0] = false;
+ ((LongColumnVector) batch.cols[0]).vector[0] = 123;
+ batch.cols[1].noNulls = false;
+ batch.cols[1].isNull[0] = true;
+ writer.addRowBatch(batch);
+
+ // write 1024 rows of (null, null)
+ batch.cols[0].isNull[0] = true;
+ writer.addRowBatch(batch);
+ }
+
+ try (Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs))) {
+ assertEquals(3072, reader.getNumberOfRows());
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+
+ // int1 is not null
+ SearchArgument sarg =
+ SearchArgumentFactory.newBuilder()
+ .startNot()
+ .isNull("int1", PredicateLeaf.Type.LONG)
+ .end()
+ .build();
+ // should find one row group
+ try (RecordReader rows =
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+ rows.nextBatch(batch);
+ assertEquals(1024, batch.size);
+ assertEquals(true, batch.cols[0].isRepeating);
+ assertEquals(123, ((LongColumnVector) batch.cols[0]).vector[0]);
+ assertEquals(false, rows.nextBatch(batch));
+ }
+
+ // string1 is not null
+ sarg = SearchArgumentFactory.newBuilder()
+ .startNot()
+ .isNull("string1", PredicateLeaf.Type.STRING)
+ .end()
+ .build();
+ // should find one row group
+ try (RecordReader rows =
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+ rows.nextBatch(batch);
+ assertEquals(1024, batch.size);
+ assertEquals(true, batch.cols[1].isRepeating);
+ assertEquals("val", ((BytesColumnVector) batch.cols[1]).toString(0));
+ assertEquals(false, rows.nextBatch(batch));
+ }
+ }
+ }
+
+ /**
+ * Write three row groups, one with (null, null), one with (1, "val"), and
one with
+ * alternating rows.
+ */
+ @Test
+ public void testPredicatePushdownMixedNulls() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ try (Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .rowIndexStride(1024)
+ .version(fileFormat))) {
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.cols[0].noNulls = false;
+ batch.cols[1].noNulls = false;
+ batch.size = 1024;
+ for (int b = 0; b < 3; ++b) {
+ for (int i = 0; i < batch.size; ++i) {
+ if (b == 0 || (b == 2 && i % 2 == 0)) {
+ batch.cols[0].isNull[i] = true; // every other value is null or 1
+ batch.cols[1].isNull[i] = true; // every other value is null or
"val"
+ } else {
+ batch.cols[0].isNull[i] = false;
+ ((LongColumnVector) batch.cols[0]).vector[i] = 1;
+ batch.cols[1].isNull[i] = false;
+ ((BytesColumnVector) batch.cols[1]).setVal(i,
"val".getBytes(StandardCharsets.UTF_8));
+ }
+ }
+ writer.addRowBatch(batch);
+ }
+ }
+
+ try (Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs))) {
+ assertEquals(3*1024, reader.getNumberOfRows());
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+
+ // int1 is not null -- should select 0 of the row groups
+ SearchArgument sarg =
+ SearchArgumentFactory.newBuilder()
+ .startNot()
+ .in("int1", PredicateLeaf.Type.LONG, 1L)
+ .end().build();
+
+ try (RecordReader rows =
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+ assertEquals(false, rows.nextBatch(batch));
+ }
+
+
+ // string1 is not null -- should select 0 of the row groups
+ sarg = SearchArgumentFactory.newBuilder()
+ .startNot()
+ .in("string1", PredicateLeaf.Type.STRING, "val")
+ .end().build();
+
+ try (RecordReader rows =
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+ assertEquals(false, rows.nextBatch(batch));
+ }
+ }
+ }
+
@Test
public void testColumnEncryption() throws Exception {
Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11);
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 6ca24a6..9f3ef90 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -378,19 +378,22 @@ public class TestRecordReaderImpl {
assertEquals(10.0d, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
- .setMinimum(10.0d).setMaximum(100.0d).build()).build()),
+ .setNumberOfValues(1)
+ .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+ .setMinimum(10.0d).setMaximum(100.0d).build()).build()),
new StubPredicate(PredicateLeaf.Type.FLOAT), true).lower);
assertEquals(null, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
+ .setNumberOfValues(1)
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
.build()), new
StubPredicate(PredicateLeaf.Type.STRING),true).lower);
assertEquals("a", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder()
- .setMinimum("a").setMaximum("b").build()).build()),
+ .setNumberOfValues(1)
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+ .setMinimum("a").setMaximum("b").build()).build()),
new StubPredicate(PredicateLeaf.Type.STRING), true).lower);
assertEquals("hello", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null, createStringStats("hello",
"world")),
@@ -403,16 +406,18 @@ public class TestRecordReaderImpl {
private static OrcProto.ColumnStatistics createIntStats(Long min,
Long max) {
+ OrcProto.ColumnStatistics.Builder result =
+ OrcProto.ColumnStatistics.newBuilder();
OrcProto.IntegerStatistics.Builder intStats =
OrcProto.IntegerStatistics.newBuilder();
if (min != null) {
intStats.setMinimum(min);
+ result.setNumberOfValues(1);
}
if (max != null) {
intStats.setMaximum(max);
}
- return OrcProto.ColumnStatistics.newBuilder()
- .setIntStatistics(intStats.build()).build();
+ return result.setIntStatistics(intStats.build()).build();
}
private static OrcProto.ColumnStatistics createBooleanStats(int n, int
trueCount) {
@@ -428,7 +433,8 @@ public class TestRecordReaderImpl {
intStats.setMinimum(min);
intStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder()
- .setIntStatistics(intStats.build()).build();
+ .setNumberOfValues(1)
+ .setIntStatistics(intStats.build()).build();
}
private static OrcProto.ColumnStatistics createDoubleStats(double min,
double max) {
@@ -436,7 +442,8 @@ public class TestRecordReaderImpl {
dblStats.setMinimum(min);
dblStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(dblStats.build()).build();
+ .setNumberOfValues(1)
+ .setDoubleStatistics(dblStats.build()).build();
}
//fixme
@@ -445,7 +452,9 @@ public class TestRecordReaderImpl {
OrcProto.StringStatistics.Builder strStats =
OrcProto.StringStatistics.newBuilder();
strStats.setMinimum(min);
strStats.setMaximum(max);
- return
OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
+ return OrcProto.ColumnStatistics.newBuilder()
+ .setNumberOfValues(1)
+ .setStringStatistics(strStats.build())
.setHasNull(hasNull).build();
}
@@ -453,7 +462,9 @@ public class TestRecordReaderImpl {
OrcProto.StringStatistics.Builder strStats =
OrcProto.StringStatistics.newBuilder();
strStats.setMinimum(min);
strStats.setMaximum(max);
- return
OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
+ return OrcProto.ColumnStatistics.newBuilder()
+ .setNumberOfValues(1)
+ .setStringStatistics(strStats.build())
.build();
}
@@ -472,7 +483,8 @@ public class TestRecordReaderImpl {
tsStats.setMinimumUtc(getUtcTimestamp(min));
tsStats.setMaximumUtc(getUtcTimestamp(max));
return OrcProto.ColumnStatistics.newBuilder()
- .setTimestampStatistics(tsStats.build()).build();
+ .setNumberOfValues(1)
+ .setTimestampStatistics(tsStats.build()).build();
}
private static OrcProto.ColumnStatistics createDecimalStats(String min,
String max) {
@@ -484,7 +496,9 @@ public class TestRecordReaderImpl {
OrcProto.DecimalStatistics.Builder decStats =
OrcProto.DecimalStatistics.newBuilder();
decStats.setMinimum(min);
decStats.setMaximum(max);
- return
OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
+ return OrcProto.ColumnStatistics.newBuilder()
+ .setNumberOfValues(1)
+ .setDecimalStatistics(decStats.build())
.setHasNull(hasNull).build();
}
@@ -496,18 +510,21 @@ public class TestRecordReaderImpl {
assertEquals(100.0d, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+ .setNumberOfValues(1)
+ .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
.setMinimum(10.0d).setMaximum(100.0d).build()).build()),
new StubPredicate(PredicateLeaf.Type.FLOAT), true).upper);
assertEquals(null, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
-
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
- .build()), new StubPredicate(PredicateLeaf.Type.STRING),
true).upper);
+ .setNumberOfValues(1)
+
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
+ .build()), new StubPredicate(PredicateLeaf.Type.STRING),
true).upper);
assertEquals("b", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+ .setNumberOfValues(1)
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder()
.setMinimum("a").setMaximum("b").build()).build()),
new StubPredicate(PredicateLeaf.Type.STRING), true).upper);
assertEquals("world", RecordReaderImpl.getValueRange(