This is an automated email from the ASF dual-hosted git repository.

gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new 261f7d267 Fix notIn for columns with null values. (#1028)
261f7d267 is described below

commit 261f7d2679407c833545b56f4c85a4ae8b5c9ed4
Author: Yujiang Zhong <42907416+zhongyuji...@users.noreply.github.com>
AuthorDate: Wed Feb 15 18:58:32 2023 +0800

    Fix notIn for columns with null values. (#1028)
---
 .../parquet/filter2/dictionarylevel/DictionaryFilter.java    |  8 ++++++++
 .../filter2/dictionarylevel/DictionaryFilterTest.java        | 12 ++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
index c21212ac1..992aaa824 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
@@ -447,6 +447,14 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> {
       return BLOCK_MIGHT_MATCH;
     }
 
+    boolean mayContainNull = (meta.getStatistics() == null
+      || !meta.getStatistics().isNumNullsSet()
+      || meta.getStatistics().getNumNulls() > 0);
+    // The column may contain nulls and the values set contains no null, so the row group cannot be eliminated.
+    if (mayContainNull) {
+      return BLOCK_MIGHT_MATCH;
+    }
+
     // if the chunk has non-dictionary pages, don't bother decoding the
     // dictionary because the row group can't be eliminated.
     if (hasNonDictionaryPages(meta)) {
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
index 65cefe46f..4fa933e75 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
@@ -35,6 +35,7 @@ import org.apache.parquet.example.data.Group;
 import org.apache.parquet.example.data.simple.SimpleGroupFactory;
 import org.apache.parquet.filter2.predicate.FilterPredicate;
 import org.apache.parquet.filter2.predicate.LogicalInverseRewriter;
+import org.apache.parquet.filter2.predicate.Operators;
 import org.apache.parquet.filter2.predicate.Operators.BinaryColumn;
 import org.apache.parquet.filter2.predicate.Operators.DoubleColumn;
 import org.apache.parquet.filter2.predicate.Operators.FloatColumn;
@@ -504,6 +505,17 @@ public class DictionaryFilterTest {
     FilterPredicate predNotIn4 = notIn(b, set4);
     assertFalse("Should not drop block for null", canDrop(predIn4, ccmd, dictionaries));
     assertFalse("Should not drop block for null", canDrop(predNotIn4, ccmd, dictionaries));
+
+    BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field");
+
+    // Test the case that all non-null values are in the set but the column may have nulls and the set has no nulls.
+    Set<Binary> set5 = new HashSet<>();
+    set5.add(Binary.fromString("sharp"));
+    FilterPredicate predNotIn5 = notIn(sharpAndNull, set5);
+    FilterPredicate predIn5 = in(sharpAndNull, set5);
+    assertFalse("Should not drop block",
+      canDrop(predNotIn5, ccmd, dictionaries));
+    assertFalse("Should not drop block", canDrop(predIn5, ccmd, dictionaries));
   }
 
   @Test

Reply via email to