This is an automated email from the ASF dual-hosted git repository.

omalley pushed a commit to branch branch-1.6
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.6 by this push:
     new de1bd43  ORC-623: Fix evaluation of SArgs over all null columns for 
ints and doubles.
de1bd43 is described below

commit de1bd43ba779178e02c0c84f6353fee9e494025a
Author: Owen O'Malley <omal...@apache.org>
AuthorDate: Tue Sep 1 09:22:34 2020 -0700

    ORC-623: Fix evaluation of SArgs over all null columns for ints
    and doubles.
    
    Fixes #542
    
    Signed-off-by: Owen O'Malley <omal...@apache.org>
---
 .../java/org/apache/orc/impl/RecordReaderImpl.java |  13 +-
 .../src/test/org/apache/orc/TestVectorOrcFile.java | 131 +++++++++++++++++++++
 .../org/apache/orc/impl/TestRecordReaderImpl.java  |  49 +++++---
 3 files changed, 176 insertions(+), 17 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java 
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 9358bf4..7142013 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -314,6 +314,15 @@ public class RecordReaderImpl implements RecordReader {
       this(predicate, lower, upper, hasNulls, false, false);
     }
 
+    /**
+     * A value range where the data is either missing or all null.
+     * @param predicate the predicate to test
+     * @param hasNulls whether there are nulls
+     */
+    ValueRange(PredicateLeaf predicate, boolean hasNulls) {
+      this(predicate, null, null, hasNulls, false, false);
+    }
+
     boolean hasValues() {
       return lower != null;
     }
@@ -384,7 +393,9 @@ public class RecordReaderImpl implements RecordReader {
   static ValueRange getValueRange(ColumnStatistics index,
                                   PredicateLeaf predicate,
                                   boolean useUTCTimestamp) {
-    if (index instanceof IntegerColumnStatistics) {
+    if (index.getNumberOfValues() == 0) {
+      return new ValueRange<>(predicate, index.hasNull());
+    } else if (index instanceof IntegerColumnStatistics) {
       IntegerColumnStatistics stats = (IntegerColumnStatistics) index;
       Long min = stats.getMinimum();
       Long max = stats.getMaximum();
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java 
b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index f827202..77804ca 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -4062,6 +4062,137 @@ public class TestVectorOrcFile {
     assertEquals(0, batch.size);
   }
 
+  /**
+   * Test predicate pushdown on nulls, with different combinations of
+   * values and nulls.
+   */
+  @Test
+  public void testPredicatePushdownAllNulls() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    try (Writer writer = OrcFile.createWriter(testFilePath,
+        
OrcFile.writerOptions(conf).setSchema(schema).rowIndexStride(1024).version(fileFormat)))
 {
+      VectorizedRowBatch batch = schema.createRowBatch();
+      batch.size = 1024;
+
+      // write 1024 rows of (null, "val")
+      batch.cols[0].noNulls = false;
+      batch.cols[0].isNull[0] = true;
+      batch.cols[0].isRepeating = true;
+      batch.cols[1].isRepeating = true;
+      ((BytesColumnVector) batch.cols[1]).setVal(0, 
"val".getBytes(StandardCharsets.UTF_8));
+      writer.addRowBatch(batch);
+
+      // write 1024 rows of (123, null)
+      batch.cols[0].isNull[0] = false;
+      ((LongColumnVector) batch.cols[0]).vector[0] = 123;
+      batch.cols[1].noNulls = false;
+      batch.cols[1].isNull[0] = true;
+      writer.addRowBatch(batch);
+
+      // write 1024 rows of (null, null)
+      batch.cols[0].isNull[0] = true;
+      writer.addRowBatch(batch);
+    }
+
+    try (Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs))) {
+      assertEquals(3072, reader.getNumberOfRows());
+      VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+
+      // int1 is not null
+      SearchArgument sarg =
+          SearchArgumentFactory.newBuilder()
+              .startNot()
+              .isNull("int1", PredicateLeaf.Type.LONG)
+              .end()
+              .build();
+      // should find one row group
+      try (RecordReader rows = 
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+        rows.nextBatch(batch);
+        assertEquals(1024, batch.size);
+        assertEquals(true, batch.cols[0].isRepeating);
+        assertEquals(123, ((LongColumnVector) batch.cols[0]).vector[0]);
+        assertEquals(false, rows.nextBatch(batch));
+      }
+
+      // string1 is not null
+      sarg = SearchArgumentFactory.newBuilder()
+          .startNot()
+          .isNull("string1", PredicateLeaf.Type.STRING)
+          .end()
+          .build();
+      // should find one row group
+      try (RecordReader rows = 
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+        rows.nextBatch(batch);
+        assertEquals(1024, batch.size);
+        assertEquals(true, batch.cols[1].isRepeating);
+        assertEquals("val", ((BytesColumnVector) batch.cols[1]).toString(0));
+        assertEquals(false, rows.nextBatch(batch));
+      }
+    }
+  }
+
+  /**
+   * Write three row groups, one with (null, null), one with (1, "val"), and 
one with
+   * alternating rows.
+   */
+  @Test
+  public void testPredicatePushdownMixedNulls() throws Exception {
+    TypeDescription schema = createInnerSchema();
+    try (Writer writer = OrcFile.createWriter(testFilePath,
+                           OrcFile.writerOptions(conf)
+                                  .setSchema(schema)
+                                  .rowIndexStride(1024)
+                                  .version(fileFormat))) {
+      VectorizedRowBatch batch = schema.createRowBatch();
+      batch.cols[0].noNulls = false;
+      batch.cols[1].noNulls = false;
+      batch.size = 1024;
+      for (int b = 0; b < 3; ++b) {
+        for (int i = 0; i < batch.size; ++i) {
+          if (b == 0 || (b == 2 && i % 2 == 0)) {
+            batch.cols[0].isNull[i] = true; // every other value is null or 1
+            batch.cols[1].isNull[i] = true; // every other value is null or 
"val"
+          } else {
+            batch.cols[0].isNull[i] = false;
+            ((LongColumnVector) batch.cols[0]).vector[i] = 1;
+            batch.cols[1].isNull[i] = false;
+            ((BytesColumnVector) batch.cols[1]).setVal(i, 
"val".getBytes(StandardCharsets.UTF_8));
+          }
+        }
+        writer.addRowBatch(batch);
+      }
+    }
+
+    try (Reader reader = OrcFile.createReader(testFilePath,
+                          OrcFile.readerOptions(conf).filesystem(fs))) {
+      assertEquals(3*1024, reader.getNumberOfRows());
+      VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+
+      // int1 not in (1) -- should select 0 of the row groups
+      SearchArgument sarg =
+          SearchArgumentFactory.newBuilder()
+              .startNot()
+              .in("int1", PredicateLeaf.Type.LONG, 1L)
+              .end().build();
+
+      try (RecordReader rows = 
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+        assertEquals(false, rows.nextBatch(batch));
+      }
+
+
+      // string1 not in ("val") -- should select 0 of the row groups
+      sarg = SearchArgumentFactory.newBuilder()
+          .startNot()
+          .in("string1", PredicateLeaf.Type.STRING, "val")
+          .end().build();
+
+      try (RecordReader rows = 
reader.rows(reader.options().searchArgument(sarg, new String[]{}))) {
+        assertEquals(false, rows.nextBatch(batch));
+      }
+    }
+  }
+
   @Test
   public void testColumnEncryption() throws Exception {
     Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11);
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java 
b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 6ca24a6..9f3ef90 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -378,19 +378,22 @@ public class TestRecordReaderImpl {
     assertEquals(10.0d, RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null,
            OrcProto.ColumnStatistics.newBuilder()
-             .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
-              .setMinimum(10.0d).setMaximum(100.0d).build()).build()),
+               .setNumberOfValues(1)
+               .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+                 .setMinimum(10.0d).setMaximum(100.0d).build()).build()),
         new StubPredicate(PredicateLeaf.Type.FLOAT), true).lower);
     assertEquals(null, RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null,
           OrcProto.ColumnStatistics.newBuilder()
+              .setNumberOfValues(1)
               
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
               .build()), new 
StubPredicate(PredicateLeaf.Type.STRING),true).lower);
     assertEquals("a", RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null,
           OrcProto.ColumnStatistics.newBuilder()
-            .setStringStatistics(OrcProto.StringStatistics.newBuilder()
-            .setMinimum("a").setMaximum("b").build()).build()),
+              .setNumberOfValues(1)
+              .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+                .setMinimum("a").setMaximum("b").build()).build()),
         new StubPredicate(PredicateLeaf.Type.STRING), true).lower);
     assertEquals("hello", RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null, createStringStats("hello", 
"world")),
@@ -403,16 +406,18 @@ public class TestRecordReaderImpl {
 
   private static OrcProto.ColumnStatistics createIntStats(Long min,
                                                           Long max) {
+    OrcProto.ColumnStatistics.Builder result =
+        OrcProto.ColumnStatistics.newBuilder();
     OrcProto.IntegerStatistics.Builder intStats =
         OrcProto.IntegerStatistics.newBuilder();
     if (min != null) {
       intStats.setMinimum(min);
+      result.setNumberOfValues(1);
     }
     if (max != null) {
       intStats.setMaximum(max);
     }
-    return OrcProto.ColumnStatistics.newBuilder()
-        .setIntStatistics(intStats.build()).build();
+    return result.setIntStatistics(intStats.build()).build();
   }
 
   private static OrcProto.ColumnStatistics createBooleanStats(int n, int 
trueCount) {
@@ -428,7 +433,8 @@ public class TestRecordReaderImpl {
     intStats.setMinimum(min);
     intStats.setMaximum(max);
     return OrcProto.ColumnStatistics.newBuilder()
-               .setIntStatistics(intStats.build()).build();
+        .setNumberOfValues(1)
+        .setIntStatistics(intStats.build()).build();
   }
 
   private static OrcProto.ColumnStatistics createDoubleStats(double min, 
double max) {
@@ -436,7 +442,8 @@ public class TestRecordReaderImpl {
     dblStats.setMinimum(min);
     dblStats.setMaximum(max);
     return OrcProto.ColumnStatistics.newBuilder()
-               .setDoubleStatistics(dblStats.build()).build();
+        .setNumberOfValues(1)
+        .setDoubleStatistics(dblStats.build()).build();
   }
 
   //fixme
@@ -445,7 +452,9 @@ public class TestRecordReaderImpl {
     OrcProto.StringStatistics.Builder strStats = 
OrcProto.StringStatistics.newBuilder();
     strStats.setMinimum(min);
     strStats.setMaximum(max);
-    return 
OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
+    return OrcProto.ColumnStatistics.newBuilder()
+        .setNumberOfValues(1)
+        .setStringStatistics(strStats.build())
         .setHasNull(hasNull).build();
   }
 
@@ -453,7 +462,9 @@ public class TestRecordReaderImpl {
     OrcProto.StringStatistics.Builder strStats = 
OrcProto.StringStatistics.newBuilder();
     strStats.setMinimum(min);
     strStats.setMaximum(max);
-    return 
OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
+    return OrcProto.ColumnStatistics.newBuilder()
+        .setNumberOfValues(1)
+        .setStringStatistics(strStats.build())
                .build();
   }
 
@@ -472,7 +483,8 @@ public class TestRecordReaderImpl {
     tsStats.setMinimumUtc(getUtcTimestamp(min));
     tsStats.setMaximumUtc(getUtcTimestamp(max));
     return OrcProto.ColumnStatistics.newBuilder()
-               .setTimestampStatistics(tsStats.build()).build();
+        .setNumberOfValues(1)
+        .setTimestampStatistics(tsStats.build()).build();
   }
 
   private static OrcProto.ColumnStatistics createDecimalStats(String min, 
String max) {
@@ -484,7 +496,9 @@ public class TestRecordReaderImpl {
     OrcProto.DecimalStatistics.Builder decStats = 
OrcProto.DecimalStatistics.newBuilder();
     decStats.setMinimum(min);
     decStats.setMaximum(max);
-    return 
OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
+    return OrcProto.ColumnStatistics.newBuilder()
+        .setNumberOfValues(1)
+        .setDecimalStatistics(decStats.build())
         .setHasNull(hasNull).build();
   }
 
@@ -496,18 +510,21 @@ public class TestRecordReaderImpl {
     assertEquals(100.0d, RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null,
           OrcProto.ColumnStatistics.newBuilder()
-            .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+              .setNumberOfValues(1)
+              .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
                 .setMinimum(10.0d).setMaximum(100.0d).build()).build()),
         new StubPredicate(PredicateLeaf.Type.FLOAT), true).upper);
     assertEquals(null, RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null,
           OrcProto.ColumnStatistics.newBuilder()
-            
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
-            .build()), new StubPredicate(PredicateLeaf.Type.STRING), 
true).upper);
+              .setNumberOfValues(1)
+              
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
+              .build()), new StubPredicate(PredicateLeaf.Type.STRING), 
true).upper);
     assertEquals("b", RecordReaderImpl.getValueRange(
         ColumnStatisticsImpl.deserialize(null,
           OrcProto.ColumnStatistics.newBuilder()
-            .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+              .setNumberOfValues(1)
+              .setStringStatistics(OrcProto.StringStatistics.newBuilder()
                 .setMinimum("a").setMaximum("b").build()).build()),
         new StubPredicate(PredicateLeaf.Type.STRING), true).upper);
     assertEquals("world", RecordReaderImpl.getValueRange(

Reply via email to