This is an automated email from the ASF dual-hosted git repository. gabor pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
     new 261f7d267 Fix notIn for columns with null values. (#1028)
261f7d267 is described below

commit 261f7d2679407c833545b56f4c85a4ae8b5c9ed4
Author: Yujiang Zhong <42907416+zhongyuji...@users.noreply.github.com>
AuthorDate: Wed Feb 15 18:58:32 2023 +0800

    Fix notIn for columns with null values. (#1028)
---
 .../parquet/filter2/dictionarylevel/DictionaryFilter.java | 8 ++++++++
 .../filter2/dictionarylevel/DictionaryFilterTest.java | 12 ++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
index c21212ac1..992aaa824 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
@@ -447,6 +447,14 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> {
       return BLOCK_MIGHT_MATCH;
     }
 
+    boolean mayContainNull = (meta.getStatistics() == null
+      || !meta.getStatistics().isNumNullsSet()
+      || meta.getStatistics().getNumNulls() > 0);
+    // The column may contain nulls and the values set contains no null, so the row group cannot be eliminated.
+    if (mayContainNull) {
+      return BLOCK_MIGHT_MATCH;
+    }
+
     // if the chunk has non-dictionary pages, don't bother decoding the
     // dictionary because the row group can't be eliminated.
     if (hasNonDictionaryPages(meta)) {
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
index 65cefe46f..4fa933e75 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
@@ -35,6 +35,7 @@ import org.apache.parquet.example.data.Group;
 import org.apache.parquet.example.data.simple.SimpleGroupFactory;
 import org.apache.parquet.filter2.predicate.FilterPredicate;
 import org.apache.parquet.filter2.predicate.LogicalInverseRewriter;
+import org.apache.parquet.filter2.predicate.Operators;
 import org.apache.parquet.filter2.predicate.Operators.BinaryColumn;
 import org.apache.parquet.filter2.predicate.Operators.DoubleColumn;
 import org.apache.parquet.filter2.predicate.Operators.FloatColumn;
@@ -504,6 +505,17 @@ public class DictionaryFilterTest {
     FilterPredicate predNotIn4 = notIn(b, set4);
     assertFalse("Should not drop block for null", canDrop(predIn4, ccmd, dictionaries));
     assertFalse("Should not drop block for null", canDrop(predNotIn4, ccmd, dictionaries));
+
+    BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field");
+
+    // Test the case that all non-null values are in the set but the column may have nulls and the set has no nulls.
+    Set<Binary> set5 = new HashSet<>();
+    set5.add(Binary.fromString("sharp"));
+    FilterPredicate predNotIn5 = notIn(sharpAndNull, set5);
+    FilterPredicate predIn5 = in(sharpAndNull, set5);
+    assertFalse("Should not drop block",
+      canDrop(predNotIn5, ccmd, dictionaries));
+    assertFalse("Should not drop block", canDrop(predIn5, ccmd, dictionaries));
   }
 
   @Test