This is an automated email from the ASF dual-hosted git repository.
gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new 261f7d267 Fix notIn for columns with null values. (#1028)
261f7d267 is described below
commit 261f7d2679407c833545b56f4c85a4ae8b5c9ed4
Author: Yujiang Zhong <[email protected]>
AuthorDate: Wed Feb 15 18:58:32 2023 +0800
Fix notIn for columns with null values. (#1028)
---
.../parquet/filter2/dictionarylevel/DictionaryFilter.java | 8 ++++++++
.../filter2/dictionarylevel/DictionaryFilterTest.java | 12 ++++++++++++
2 files changed, 20 insertions(+)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
index c21212ac1..992aaa824 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
@@ -447,6 +447,14 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> {
return BLOCK_MIGHT_MATCH;
}
+ boolean mayContainNull = (meta.getStatistics() == null
+ || !meta.getStatistics().isNumNullsSet()
+ || meta.getStatistics().getNumNulls() > 0);
+ // The column may contain nulls and the values set contains no null, so the row group cannot be eliminated.
+ if (mayContainNull) {
+ return BLOCK_MIGHT_MATCH;
+ }
+
// if the chunk has non-dictionary pages, don't bother decoding the
// dictionary because the row group can't be eliminated.
if (hasNonDictionaryPages(meta)) {
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
index 65cefe46f..4fa933e75 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
@@ -35,6 +35,7 @@ import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.LogicalInverseRewriter;
+import org.apache.parquet.filter2.predicate.Operators;
import org.apache.parquet.filter2.predicate.Operators.BinaryColumn;
import org.apache.parquet.filter2.predicate.Operators.DoubleColumn;
import org.apache.parquet.filter2.predicate.Operators.FloatColumn;
@@ -504,6 +505,17 @@ public class DictionaryFilterTest {
FilterPredicate predNotIn4 = notIn(b, set4);
assertFalse("Should not drop block for null", canDrop(predIn4, ccmd,
dictionaries));
assertFalse("Should not drop block for null", canDrop(predNotIn4, ccmd,
dictionaries));
+
+ BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field");
+
+ // Test the case that all non-null values are in the set but the column may have nulls and the set has no nulls.
+ Set<Binary> set5 = new HashSet<>();
+ set5.add(Binary.fromString("sharp"));
+ FilterPredicate predNotIn5 = notIn(sharpAndNull, set5);
+ FilterPredicate predIn5 = in(sharpAndNull, set5);
+ assertFalse("Should not drop block",
+ canDrop(predNotIn5, ccmd, dictionaries));
+ assertFalse("Should not drop block", canDrop(predIn5, ccmd, dictionaries));
}
@Test